From: Andy Heninger Date: Fri, 26 Feb 2016 21:58:26 +0000 (+0000) Subject: ICU-12081 Initial implementation Emoji break rules and a new RBBI monkey test. X-Git-Tag: milestone-59-0-1~636 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9d9256f3b792100cda697c7bcf52bacfbc3bca87;p=icu ICU-12081 Initial implementation Emoji break rules and a new RBBI monkey test. X-SVN-Rev: 38387 --- diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp index 6f3df2c4b70..fe27acc8af6 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -983,6 +983,54 @@ enum RBBIRunMode { }; +// Map from look-ahead break states (corresponds to rules) to boundary positions. +// Allows multiple lookahead break rules to be in flight at the same time. +// +// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers +// in the state table be sequential, then we can just index an array. And the +// table could also tell us in advance how big that array needs to be. +// +// Before ICU 57 there was just a single simple variable for a look-ahead match that +// was in progress. Two rules at once did not work. 
+ +static const int32_t kMaxLookaheads = 8; +struct LookAheadResults { + int32_t fUsedSlotLimit; + int32_t fPositions[8]; + int16_t fKeys[8]; + + LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}; + + int32_t getPosition(int16_t key) { + for (int32_t i=0; i<fUsedSlotLimit; ++i) { + if (fKeys[i] == key) { + return fPositions[i]; + } + } + U_ASSERT(FALSE); + return -1; + } + + void setPosition(int16_t key, int32_t position) { + int32_t i; + for (i=0; i<fUsedSlotLimit; ++i) { + if (fKeys[i] == key) { + fPositions[i] = position; + return; + } + } + if (i >= kMaxLookaheads) { + U_ASSERT(FALSE); + i = kMaxLookaheads - 1; + } + fKeys[i] = key; + fPositions[i] = position; + U_ASSERT(fUsedSlotLimit == i); + fUsedSlotLimit = i + 1; + } +}; + + //----------------------------------------------------------------------------------- // // handleNext(stateTable) @@ -1000,14 +1048,11 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { RBBIStateTableRow *row; UChar32 c; - int32_t lookaheadStatus = 0; - int32_t lookaheadTagIdx = 0; - int32_t result = 0; - int32_t initialPosition = 0; - int32_t lookaheadResult = 0; - UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; - const char *tableData = statetable->fTableData; - uint32_t tableRowLen = statetable->fRowLen; + LookAheadResults lookAheadMatches; + int32_t result = 0; + int32_t initialPosition = 0; + const char *tableData = statetable->fTableData; + uint32_t tableRowLen = statetable->fRowLen; #ifdef RBBI_DEBUG if (fTrace) { @@ -1050,14 +1095,6 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { // We have already run the loop one last time with the // character set to the psueudo {eof} value. Now it is time // to unconditionally bail out. - if (lookaheadResult > result) { - // We ran off the end of the string with a pending look-ahead match. - // Treat this as if the look-ahead condition had been met, and return - // the match at the / position from the look-ahead rule. - result = lookaheadResult; - fLastRuleStatusIndex = lookaheadTagIdx; - lookaheadStatus = 0; - } break; } // Run the loop one last time with the fake end-of-input character category. 
@@ -1123,38 +1160,23 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. } - if (row->fLookAhead != 0) { - if (lookaheadStatus != 0 - && row->fAccepting == lookaheadStatus) { - // Lookahead match is completed. - result = lookaheadResult; - fLastRuleStatusIndex = lookaheadTagIdx; - lookaheadStatus = 0; - // TODO: make a standalone hard break in a rule work. - if (lookAheadHardBreak) { - UTEXT_SETNATIVEINDEX(fText, result); - return result; - } - // Look-ahead completed, but other rules may match further. Continue on - // TODO: junk this feature? I don't think it's used anywhwere. - goto continueOn; + int16_t completedRule = row->fAccepting; + if (completedRule > 0) { + // Lookahead match is completed. + int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule); + if (lookaheadResult >= 0) { + fLastRuleStatusIndex = row->fTagIdx; + UTEXT_SETNATIVEINDEX(fText, lookaheadResult); + return lookaheadResult; } - - int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); - lookaheadResult = r; - lookaheadStatus = row->fLookAhead; - lookaheadTagIdx = row->fTagIdx; - goto continueOn; } - - - if (row->fAccepting != 0) { - // Because this is an accepting state, any in-progress look-ahead match - // is no longer relavant. Clear out the pending lookahead status. - lookaheadStatus = 0; // clear out any pending look-ahead match. + int16_t rule = row->fLookAhead; + if (rule != 0) { + // At the position of a '/' in a look-ahead match. Record it. + int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); + lookAheadMatches.setPosition(rule, pos); } -continueOn: if (state == STOP_STATE) { // This is the normal exit from the lookup state machine. 
// We have advanced through the string until it is certain that no @@ -1216,11 +1238,9 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) RBBIRunMode mode; RBBIStateTableRow *row; UChar32 c; - int32_t lookaheadStatus = 0; + LookAheadResults lookAheadMatches; int32_t result = 0; int32_t initialPosition = 0; - int32_t lookaheadResult = 0; - UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; #ifdef RBBI_DEBUG if (fTrace) { @@ -1266,13 +1286,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) // We have already run the loop one last time with the // character set to the psueudo {eof} value. Now it is time // to unconditionally bail out. - if (lookaheadResult < result) { - // We ran off the end of the string with a pending look-ahead match. - // Treat this as if the look-ahead condition had been met, and return - // the match at the / position from the look-ahead rule. - result = lookaheadResult; - lookaheadStatus = 0; - } else if (result == initialPosition) { + if (result == initialPosition) { // Ran off start, no match found. // move one index one (towards the start, since we are doing a previous()) UTEXT_SETNATIVEINDEX(fText, initialPosition); @@ -1338,36 +1352,22 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) result = (int32_t)UTEXT_GETNATIVEINDEX(fText); } - if (row->fLookAhead != 0) { - if (lookaheadStatus != 0 - && row->fAccepting == lookaheadStatus) { - // Lookahead match is completed. - result = lookaheadResult; - lookaheadStatus = 0; - // TODO: make a standalone hard break in a rule work. - if (lookAheadHardBreak) { - UTEXT_SETNATIVEINDEX(fText, result); - return result; - } - // Look-ahead completed, but other rules may match further. Continue on - // TODO: junk this feature? I don't think it's used anywhwere. 
- goto continueOn; + int16_t completedRule = row->fAccepting; + if (completedRule > 0) { + // Lookahead match is completed. + int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule); + if (lookaheadResult >= 0) { + UTEXT_SETNATIVEINDEX(fText, lookaheadResult); + return lookaheadResult; } - - int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); - lookaheadResult = r; - lookaheadStatus = row->fLookAhead; - goto continueOn; } - - - if (row->fAccepting != 0) { - // Because this is an accepting state, any in-progress look-ahead match - // is no longer relavant. Clear out the pending lookahead status. - lookaheadStatus = 0; + int16_t rule = row->fLookAhead; + if (rule != 0) { + // At the position of a '/' in a look-ahead match. Record it. + int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); + lookAheadMatches.setPosition(rule, pos); } -continueOn: if (state == STOP_STATE) { // This is the normal exit from the lookup state machine. // We have advanced through the string until it is certain that no diff --git a/icu4c/source/common/rbbinode.cpp b/icu4c/source/common/rbbinode.cpp index 49e0ad3dfa4..1468be9c10c 100644 --- a/icu4c/source/common/rbbinode.cpp +++ b/icu4c/source/common/rbbinode.cpp @@ -1,6 +1,6 @@ /* *************************************************************************** -* Copyright (C) 2002-2008 International Business Machines Corporation * +* Copyright (C) 2002-2016 International Business Machines Corporation * * and others. All rights reserved. 
* *************************************************************************** */ @@ -56,6 +56,8 @@ RBBINode::RBBINode(NodeType t) : UMemory() { fLastPos = 0; fNullable = FALSE; fLookAheadEnd = FALSE; + fRuleRoot = FALSE; + fChainIn = FALSE; fVal = 0; fPrecedence = precZero; @@ -86,6 +88,8 @@ RBBINode::RBBINode(const RBBINode &other) : UMemory(other) { fLastPos = other.fLastPos; fNullable = other.fNullable; fVal = other.fVal; + fRuleRoot = FALSE; + fChainIn = other.fChainIn; UErrorCode status = U_ZERO_ERROR; fFirstPosSet = new UVector(status); // TODO - get a real status from somewhere fLastPosSet = new UVector(status); @@ -161,6 +165,8 @@ RBBINode *RBBINode::cloneTree() { } } } + n->fRuleRoot = this->fRuleRoot; + n->fChainIn = this->fChainIn; return n; } @@ -272,6 +278,12 @@ void RBBINode::findNodes(UVector *dest, RBBINode::NodeType kind, UErrorCode &s // //------------------------------------------------------------------------- #ifdef RBBI_DEBUG + +static int32_t serial(const RBBINode *node) { + return (node == NULL? 
-1 : node->fSerialNum); +} + + void RBBINode::printNode() { static const char * const nodeTypeNames[] = { "setRef", @@ -295,9 +307,10 @@ void RBBINode::printNode() { if (this==NULL) { RBBIDebugPrintf("%10p", (void *)this); } else { - RBBIDebugPrintf("%10p %12s %10p %10p %10p %4d %6d %d ", - (void *)this, nodeTypeNames[fType], (void *)fParent, (void *)fLeftChild, (void *)fRightChild, - fSerialNum, fFirstPos, fVal); + RBBIDebugPrintf("%10p %5d %12s %c%c %5d %5d %5d %6d %d ", + (void *)this, fSerialNum, nodeTypeNames[fType], fRuleRoot?'R':' ', fChainIn?'C':' ', + serial(fLeftChild), serial(fRightChild), serial(fParent), + fFirstPos, fVal); if (fType == varRef) { RBBI_DEBUG_printUnicodeString(fText); } @@ -328,11 +341,13 @@ U_CFUNC void RBBI_DEBUG_printUnicodeString(const UnicodeString &s, int minWidth) // //------------------------------------------------------------------------- #ifdef RBBI_DEBUG +void RBBINode::printNodeHeader() { + RBBIDebugPrintf(" Address serial type LeftChild RightChild Parent position value\n"); +} + void RBBINode::printTree(UBool printHeading) { if (printHeading) { - RBBIDebugPrintf( "-------------------------------------------------------------------\n" - " Address type Parent LeftChild RightChild serial position value\n" - ); + printNodeHeader(); } this->printNode(); if (this != NULL) { diff --git a/icu4c/source/common/rbbinode.h b/icu4c/source/common/rbbinode.h index 8f7890bd8e3..babd312640f 100644 --- a/icu4c/source/common/rbbinode.h +++ b/icu4c/source/common/rbbinode.h @@ -80,6 +80,10 @@ class RBBINode : public UMemory { UBool fLookAheadEnd; // For endMark nodes, set TRUE if // marking the end of a look-ahead rule. + UBool fRuleRoot; // True if this node is the root of a rule. + UBool fChainIn; // True if chaining into this rule is allowed + // (no '^' present). + UVector *fFirstPosSet; UVector *fLastPosSet; // TODO: rename fFirstPos & fLastPos to avoid confusion. 
UVector *fFollowPos; @@ -95,6 +99,7 @@ class RBBINode : public UMemory { void findNodes(UVector *dest, RBBINode::NodeType kind, UErrorCode &status); #ifdef RBBI_DEBUG + static void printNodeHeader(); void printNode(); void printTree(UBool withHeading); #endif @@ -104,6 +109,7 @@ class RBBINode : public UMemory { UBool operator == (const RBBINode &other); // Private, so these functions won't accidently be used. #ifdef RBBI_DEBUG + public: int fSerialNum; // Debugging aids. #endif }; diff --git a/icu4c/source/common/rbbirpt.h b/icu4c/source/common/rbbirpt.h index b6c8795c35d..e85cce3cc09 100644 --- a/icu4c/source/common/rbbirpt.h +++ b/icu4c/source/common/rbbirpt.h @@ -40,6 +40,7 @@ enum RBBI_RuleParseAction { doExprStart, doLParen, doNOP, + doNoChain, doOptionEnd, doOptionStart, doReverseDir, @@ -77,101 +78,109 @@ struct RBBIRuleTableEl { static const struct RBBIRuleTableEl gRuleParseStateTable[] = { {doNOP, 0, 0, 0, TRUE} - , {doExprStart, 254, 21, 8, FALSE} // 1 start + , {doExprStart, 254, 29, 9, FALSE} // 1 start , {doNOP, 132, 1,0, TRUE} // 2 - , {doExprStart, 36 /* $ */, 80, 90, FALSE} // 3 - , {doNOP, 33 /* ! */, 11,0, TRUE} // 4 - , {doNOP, 59 /* ; */, 1,0, TRUE} // 5 - , {doNOP, 252, 0,0, FALSE} // 6 - , {doExprStart, 255, 21, 8, FALSE} // 7 - , {doEndOfRule, 59 /* ; */, 1,0, TRUE} // 8 break-rule-end - , {doNOP, 132, 8,0, TRUE} // 9 - , {doRuleError, 255, 95,0, FALSE} // 10 - , {doNOP, 33 /* ! 
*/, 13,0, TRUE} // 11 rev-option - , {doReverseDir, 255, 20, 8, FALSE} // 12 - , {doOptionStart, 130, 15,0, TRUE} // 13 option-scan1 - , {doRuleError, 255, 95,0, FALSE} // 14 - , {doNOP, 129, 15,0, TRUE} // 15 option-scan2 - , {doOptionEnd, 255, 17,0, FALSE} // 16 - , {doNOP, 59 /* ; */, 1,0, TRUE} // 17 option-scan3 - , {doNOP, 132, 17,0, TRUE} // 18 - , {doRuleError, 255, 95,0, FALSE} // 19 - , {doExprStart, 255, 21, 8, FALSE} // 20 reverse-rule - , {doRuleChar, 254, 30,0, TRUE} // 21 term - , {doNOP, 132, 21,0, TRUE} // 22 - , {doRuleChar, 131, 30,0, TRUE} // 23 - , {doNOP, 91 /* [ */, 86, 30, FALSE} // 24 - , {doLParen, 40 /* ( */, 21, 30, TRUE} // 25 - , {doNOP, 36 /* $ */, 80, 29, FALSE} // 26 - , {doDotAny, 46 /* . */, 30,0, TRUE} // 27 - , {doRuleError, 255, 95,0, FALSE} // 28 - , {doCheckVarDef, 255, 30,0, FALSE} // 29 term-var-ref - , {doNOP, 132, 30,0, TRUE} // 30 expr-mod - , {doUnaryOpStar, 42 /* * */, 35,0, TRUE} // 31 - , {doUnaryOpPlus, 43 /* + */, 35,0, TRUE} // 32 - , {doUnaryOpQuestion, 63 /* ? */, 35,0, TRUE} // 33 - , {doNOP, 255, 35,0, FALSE} // 34 - , {doExprCatOperator, 254, 21,0, FALSE} // 35 expr-cont - , {doNOP, 132, 35,0, TRUE} // 36 - , {doExprCatOperator, 131, 21,0, FALSE} // 37 - , {doExprCatOperator, 91 /* [ */, 21,0, FALSE} // 38 - , {doExprCatOperator, 40 /* ( */, 21,0, FALSE} // 39 - , {doExprCatOperator, 36 /* $ */, 21,0, FALSE} // 40 - , {doExprCatOperator, 46 /* . 
*/, 21,0, FALSE} // 41 - , {doExprCatOperator, 47 /* / */, 47,0, FALSE} // 42 - , {doExprCatOperator, 123 /* { */, 59,0, TRUE} // 43 - , {doExprOrOperator, 124 /* | */, 21,0, TRUE} // 44 - , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 45 - , {doExprFinished, 255, 255,0, FALSE} // 46 - , {doSlash, 47 /* / */, 49,0, TRUE} // 47 look-ahead - , {doNOP, 255, 95,0, FALSE} // 48 - , {doExprCatOperator, 254, 21,0, FALSE} // 49 expr-cont-no-slash - , {doNOP, 132, 35,0, TRUE} // 50 - , {doExprCatOperator, 131, 21,0, FALSE} // 51 - , {doExprCatOperator, 91 /* [ */, 21,0, FALSE} // 52 - , {doExprCatOperator, 40 /* ( */, 21,0, FALSE} // 53 - , {doExprCatOperator, 36 /* $ */, 21,0, FALSE} // 54 - , {doExprCatOperator, 46 /* . */, 21,0, FALSE} // 55 - , {doExprOrOperator, 124 /* | */, 21,0, TRUE} // 56 - , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 57 - , {doExprFinished, 255, 255,0, FALSE} // 58 - , {doNOP, 132, 59,0, TRUE} // 59 tag-open - , {doStartTagValue, 128, 62,0, FALSE} // 60 - , {doTagExpectedError, 255, 95,0, FALSE} // 61 - , {doNOP, 132, 66,0, TRUE} // 62 tag-value - , {doNOP, 125 /* } */, 66,0, FALSE} // 63 - , {doTagDigit, 128, 62,0, TRUE} // 64 - , {doTagExpectedError, 255, 95,0, FALSE} // 65 - , {doNOP, 132, 66,0, TRUE} // 66 tag-close - , {doTagValue, 125 /* } */, 69,0, TRUE} // 67 - , {doTagExpectedError, 255, 95,0, FALSE} // 68 - , {doExprCatOperator, 254, 21,0, FALSE} // 69 expr-cont-no-tag - , {doNOP, 132, 69,0, TRUE} // 70 - , {doExprCatOperator, 131, 21,0, FALSE} // 71 - , {doExprCatOperator, 91 /* [ */, 21,0, FALSE} // 72 - , {doExprCatOperator, 40 /* ( */, 21,0, FALSE} // 73 - , {doExprCatOperator, 36 /* $ */, 21,0, FALSE} // 74 - , {doExprCatOperator, 46 /* . 
*/, 21,0, FALSE} // 75 - , {doExprCatOperator, 47 /* / */, 47,0, FALSE} // 76 - , {doExprOrOperator, 124 /* | */, 21,0, TRUE} // 77 - , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 78 - , {doExprFinished, 255, 255,0, FALSE} // 79 - , {doStartVariableName, 36 /* $ */, 82,0, TRUE} // 80 scan-var-name - , {doNOP, 255, 95,0, FALSE} // 81 - , {doNOP, 130, 84,0, TRUE} // 82 scan-var-start - , {doVariableNameExpectedErr, 255, 95,0, FALSE} // 83 - , {doNOP, 129, 84,0, TRUE} // 84 scan-var-body - , {doEndVariableName, 255, 255,0, FALSE} // 85 - , {doScanUnicodeSet, 91 /* [ */, 255,0, TRUE} // 86 scan-unicode-set - , {doScanUnicodeSet, 112 /* p */, 255,0, TRUE} // 87 - , {doScanUnicodeSet, 80 /* P */, 255,0, TRUE} // 88 - , {doNOP, 255, 95,0, FALSE} // 89 - , {doNOP, 132, 90,0, TRUE} // 90 assign-or-rule - , {doStartAssign, 61 /* = */, 21, 93, TRUE} // 91 - , {doNOP, 255, 29, 8, FALSE} // 92 - , {doEndAssign, 59 /* ; */, 1,0, TRUE} // 93 assign-end - , {doRuleErrorAssignExpr, 255, 95,0, FALSE} // 94 - , {doExit, 255, 95,0, TRUE} // 95 errorDeath + , {doNoChain, 94 /* ^ */, 12, 9, TRUE} // 3 + , {doExprStart, 36 /* $ */, 88, 98, FALSE} // 4 + , {doNOP, 33 /* ! */, 19,0, TRUE} // 5 + , {doNOP, 59 /* ; */, 1,0, TRUE} // 6 + , {doNOP, 252, 0,0, FALSE} // 7 + , {doExprStart, 255, 29, 9, FALSE} // 8 + , {doEndOfRule, 59 /* ; */, 1,0, TRUE} // 9 break-rule-end + , {doNOP, 132, 9,0, TRUE} // 10 + , {doRuleError, 255, 103,0, FALSE} // 11 + , {doExprStart, 254, 29,0, FALSE} // 12 start-after-caret + , {doNOP, 132, 12,0, TRUE} // 13 + , {doRuleError, 94 /* ^ */, 103,0, FALSE} // 14 + , {doExprStart, 36 /* $ */, 88, 37, FALSE} // 15 + , {doRuleError, 59 /* ; */, 103,0, FALSE} // 16 + , {doRuleError, 252, 103,0, FALSE} // 17 + , {doExprStart, 255, 29,0, FALSE} // 18 + , {doNOP, 33 /* ! 
*/, 21,0, TRUE} // 19 rev-option + , {doReverseDir, 255, 28, 9, FALSE} // 20 + , {doOptionStart, 130, 23,0, TRUE} // 21 option-scan1 + , {doRuleError, 255, 103,0, FALSE} // 22 + , {doNOP, 129, 23,0, TRUE} // 23 option-scan2 + , {doOptionEnd, 255, 25,0, FALSE} // 24 + , {doNOP, 59 /* ; */, 1,0, TRUE} // 25 option-scan3 + , {doNOP, 132, 25,0, TRUE} // 26 + , {doRuleError, 255, 103,0, FALSE} // 27 + , {doExprStart, 255, 29, 9, FALSE} // 28 reverse-rule + , {doRuleChar, 254, 38,0, TRUE} // 29 term + , {doNOP, 132, 29,0, TRUE} // 30 + , {doRuleChar, 131, 38,0, TRUE} // 31 + , {doNOP, 91 /* [ */, 94, 38, FALSE} // 32 + , {doLParen, 40 /* ( */, 29, 38, TRUE} // 33 + , {doNOP, 36 /* $ */, 88, 37, FALSE} // 34 + , {doDotAny, 46 /* . */, 38,0, TRUE} // 35 + , {doRuleError, 255, 103,0, FALSE} // 36 + , {doCheckVarDef, 255, 38,0, FALSE} // 37 term-var-ref + , {doNOP, 132, 38,0, TRUE} // 38 expr-mod + , {doUnaryOpStar, 42 /* * */, 43,0, TRUE} // 39 + , {doUnaryOpPlus, 43 /* + */, 43,0, TRUE} // 40 + , {doUnaryOpQuestion, 63 /* ? */, 43,0, TRUE} // 41 + , {doNOP, 255, 43,0, FALSE} // 42 + , {doExprCatOperator, 254, 29,0, FALSE} // 43 expr-cont + , {doNOP, 132, 43,0, TRUE} // 44 + , {doExprCatOperator, 131, 29,0, FALSE} // 45 + , {doExprCatOperator, 91 /* [ */, 29,0, FALSE} // 46 + , {doExprCatOperator, 40 /* ( */, 29,0, FALSE} // 47 + , {doExprCatOperator, 36 /* $ */, 29,0, FALSE} // 48 + , {doExprCatOperator, 46 /* . 
*/, 29,0, FALSE} // 49 + , {doExprCatOperator, 47 /* / */, 55,0, FALSE} // 50 + , {doExprCatOperator, 123 /* { */, 67,0, TRUE} // 51 + , {doExprOrOperator, 124 /* | */, 29,0, TRUE} // 52 + , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 53 + , {doExprFinished, 255, 255,0, FALSE} // 54 + , {doSlash, 47 /* / */, 57,0, TRUE} // 55 look-ahead + , {doNOP, 255, 103,0, FALSE} // 56 + , {doExprCatOperator, 254, 29,0, FALSE} // 57 expr-cont-no-slash + , {doNOP, 132, 43,0, TRUE} // 58 + , {doExprCatOperator, 131, 29,0, FALSE} // 59 + , {doExprCatOperator, 91 /* [ */, 29,0, FALSE} // 60 + , {doExprCatOperator, 40 /* ( */, 29,0, FALSE} // 61 + , {doExprCatOperator, 36 /* $ */, 29,0, FALSE} // 62 + , {doExprCatOperator, 46 /* . */, 29,0, FALSE} // 63 + , {doExprOrOperator, 124 /* | */, 29,0, TRUE} // 64 + , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 65 + , {doExprFinished, 255, 255,0, FALSE} // 66 + , {doNOP, 132, 67,0, TRUE} // 67 tag-open + , {doStartTagValue, 128, 70,0, FALSE} // 68 + , {doTagExpectedError, 255, 103,0, FALSE} // 69 + , {doNOP, 132, 74,0, TRUE} // 70 tag-value + , {doNOP, 125 /* } */, 74,0, FALSE} // 71 + , {doTagDigit, 128, 70,0, TRUE} // 72 + , {doTagExpectedError, 255, 103,0, FALSE} // 73 + , {doNOP, 132, 74,0, TRUE} // 74 tag-close + , {doTagValue, 125 /* } */, 77,0, TRUE} // 75 + , {doTagExpectedError, 255, 103,0, FALSE} // 76 + , {doExprCatOperator, 254, 29,0, FALSE} // 77 expr-cont-no-tag + , {doNOP, 132, 77,0, TRUE} // 78 + , {doExprCatOperator, 131, 29,0, FALSE} // 79 + , {doExprCatOperator, 91 /* [ */, 29,0, FALSE} // 80 + , {doExprCatOperator, 40 /* ( */, 29,0, FALSE} // 81 + , {doExprCatOperator, 36 /* $ */, 29,0, FALSE} // 82 + , {doExprCatOperator, 46 /* . 
*/, 29,0, FALSE} // 83 + , {doExprCatOperator, 47 /* / */, 55,0, FALSE} // 84 + , {doExprOrOperator, 124 /* | */, 29,0, TRUE} // 85 + , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 86 + , {doExprFinished, 255, 255,0, FALSE} // 87 + , {doStartVariableName, 36 /* $ */, 90,0, TRUE} // 88 scan-var-name + , {doNOP, 255, 103,0, FALSE} // 89 + , {doNOP, 130, 92,0, TRUE} // 90 scan-var-start + , {doVariableNameExpectedErr, 255, 103,0, FALSE} // 91 + , {doNOP, 129, 92,0, TRUE} // 92 scan-var-body + , {doEndVariableName, 255, 255,0, FALSE} // 93 + , {doScanUnicodeSet, 91 /* [ */, 255,0, TRUE} // 94 scan-unicode-set + , {doScanUnicodeSet, 112 /* p */, 255,0, TRUE} // 95 + , {doScanUnicodeSet, 80 /* P */, 255,0, TRUE} // 96 + , {doNOP, 255, 103,0, FALSE} // 97 + , {doNOP, 132, 98,0, TRUE} // 98 assign-or-rule + , {doStartAssign, 61 /* = */, 29, 101, TRUE} // 99 + , {doNOP, 255, 37, 9, FALSE} // 100 + , {doEndAssign, 59 /* ; */, 1,0, TRUE} // 101 assign-end + , {doRuleErrorAssignExpr, 255, 103,0, FALSE} // 102 + , {doExit, 255, 103,0, TRUE} // 103 errorDeath }; #ifdef RBBI_DEBUG static const char * const RBBIRuleStateNames[] = { 0, @@ -181,9 +190,17 @@ static const char * const RBBIRuleStateNames[] = { 0, 0, 0, 0, + 0, 0, "break-rule-end", 0, + 0, + "start-after-caret", + 0, + 0, + 0, + 0, + 0, 0, "rev-option", 0, diff --git a/icu4c/source/common/rbbirpt.txt b/icu4c/source/common/rbbirpt.txt index 8e932a6037a..fd00a83ae86 100644 --- a/icu4c/source/common/rbbirpt.txt +++ b/icu4c/source/common/rbbirpt.txt @@ -1,7 +1,7 @@ #***************************************************************************** # -# Copyright (C) 2002-2003, International Business Machines Corporation and others. +# Copyright (C) 2002-2016, International Business Machines Corporation and others. # All Rights Reserved. 
# #***************************************************************************** @@ -19,6 +19,7 @@ # This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays # that are then built with the rule parser. # +# perl rbbicst.pl < rbbirpt.txt > rbbirpt.h # # Here is the syntax of the state definitions in this file: @@ -57,6 +58,7 @@ start: escaped term ^break-rule-end doExprStart white_space n start + '^' n start-after-caret ^break-rule-end doNoChain '$' scan-var-name ^assign-or-rule doExprStart '!' n rev-option ';' n start # ignore empty rules. @@ -71,7 +73,21 @@ break-rule-end: white_space n break-rule-end default errorDeath doRuleError - +# +# start of a rule, after having seen a '^' (inhibits rule chain in). +# Similar to the main 'start' state in most respects, except +# - empty rule is an error. +# - A second '^' is an error. +# +start-after-caret: + escaped term doExprStart + white_space n start-after-caret + '^' errorDeath doRuleError # two '^'s + '$' scan-var-name ^term-var-ref doExprStart + ';' errorDeath doRuleError # ^ ; + eof errorDeath doRuleError + default term doExprStart + # # ! We've just scanned a '!', indicating either a !!key word flag or a # !Reverse rule. diff --git a/icu4c/source/common/rbbiscan.cpp b/icu4c/source/common/rbbiscan.cpp index 1dc6b704d01..767a24c399f 100644 --- a/icu4c/source/common/rbbiscan.cpp +++ b/icu4c/source/common/rbbiscan.cpp @@ -1,7 +1,7 @@ // // file: rbbiscan.cpp // -// Copyright (C) 2002-2015, International Business Machines Corporation and others. +// Copyright (C) 2002-2016, International Business Machines Corporation and others. // All Rights Reserved. 
// // This file contains the Rule Based Break Iterator Rule Builder functions for @@ -87,24 +87,27 @@ U_NAMESPACE_BEGIN RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb) { fRB = rb; + fScanIndex = 0; + fNextIndex = 0; + fQuoteMode = FALSE; + fLineNum = 1; + fCharNum = 0; + fLastChar = 0; + + fStateTable = NULL; + fStack[0] = 0; fStackPtr = 0; - fStack[fStackPtr] = 0; - fNodeStackPtr = 0; - fRuleNum = 0; fNodeStack[0] = NULL; - - fSymbolTable = NULL; - fSetTable = NULL; - - fScanIndex = 0; - fNextIndex = 0; + fNodeStackPtr = 0; fReverseRule = FALSE; fLookAheadRule = FALSE; + fNoChainInRule = FALSE; - fLineNum = 1; - fCharNum = 0; - fQuoteMode = FALSE; + fSymbolTable = NULL; + fSetTable = NULL; + fRuleNum = 0; + fOptionStart = 0; // Do not check status until after all critical fields are sufficiently initialized // that the destructor can run cleanly. @@ -205,6 +208,12 @@ UBool RBBIRuleScanner::doParseActions(int32_t action) break; + case doNoChain: + // Scanned a '^' while on the rule start state. + fNoChainInRule = TRUE; + break; + + case doExprOrOperator: { fixOpStack(RBBINode::precOpCat); @@ -318,11 +327,11 @@ UBool RBBIRuleScanner::doParseActions(int32_t action) if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");} #endif U_ASSERT(fNodeStackPtr == 1); + RBBINode *thisRule = fNodeStack[fNodeStackPtr]; // If this rule includes a look-ahead '/', add a endMark node to the // expression tree. if (fLookAheadRule) { - RBBINode *thisRule = fNodeStack[fNodeStackPtr]; RBBINode *endNode = pushNewNode(RBBINode::endMark); RBBINode *catNode = pushNewNode(RBBINode::opCat); if (U_FAILURE(*fRB->fStatus)) { @@ -334,8 +343,24 @@ UBool RBBIRuleScanner::doParseActions(int32_t action) fNodeStack[fNodeStackPtr] = catNode; endNode->fVal = fRuleNum; endNode->fLookAheadEnd = TRUE; + thisRule = catNode; + + // TODO: Disable chaining out of look-ahead (hard break) rules. 
+ // The break on rule match is forced, so there is no point in building up + // the state table to chain into another rule for a longer match. } + // Mark this node as being the root of a rule. + thisRule->fRuleRoot = TRUE; + + // Flag if chaining into this rule is wanted. + // + if (fRB->fChainRules && // If rule chaining is enabled globally via !!chain + !fNoChainInRule) { // and no '^' chain-in inhibit was on this rule + thisRule->fChainIn = TRUE; + } + + // All rule expressions are ORed together. // The ';' that terminates an expression really just functions as a '|' with // a low operator prededence. @@ -372,6 +397,7 @@ UBool RBBIRuleScanner::doParseActions(int32_t action) } fReverseRule = FALSE; // in preparation for the next rule. fLookAheadRule = FALSE; + fNoChainInRule = FALSE; fNodeStackPtr = 0; } break; @@ -994,7 +1020,7 @@ void RBBIRuleScanner::parse() { for (;;) { #ifdef RBBI_DEBUG - if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf(".");} + if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("."); fflush(stdout);} #endif if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE && tableEl->fCharClass == fC.fChar) { // Table row specified an individual character, not a set, and diff --git a/icu4c/source/common/rbbiscan.h b/icu4c/source/common/rbbiscan.h index f1f9fd71f1e..a36e4fd92af 100644 --- a/icu4c/source/common/rbbiscan.h +++ b/icu4c/source/common/rbbiscan.h @@ -52,6 +52,7 @@ public: struct RBBIRuleChar { UChar32 fChar; UBool fEscaped; + RBBIRuleChar() : fChar(0), fEscaped(FALSE) {}; }; RBBIRuleScanner(RBBIRuleBuilder *rb); @@ -127,6 +128,8 @@ private: UBool fLookAheadRule; // True if the rule includes a '/' // somewhere within it. + UBool fNoChainInRule; // True if the current rule starts with a '^'. + RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of // $variable symbols. 
diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp index 2ce82dfed18..09ec3ce4631 100644 --- a/icu4c/source/common/rbbitblb.cpp +++ b/icu4c/source/common/rbbitblb.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (c) 2002-2009, International Business Machines +* Copyright (c) 2002-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -78,7 +78,7 @@ void RBBITableBuilder::build() { fTree = fTree->flattenVariables(); #ifdef RBBI_DEBUG if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) { - RBBIDebugPuts("Parse tree after flattening variable references."); + RBBIDebugPuts("\nParse tree after flattening variable references."); fTree->printTree(TRUE); } #endif @@ -136,7 +136,7 @@ void RBBITableBuilder::build() { fTree->flattenSets(); #ifdef RBBI_DEBUG if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) { - RBBIDebugPuts("Parse tree after flattening Unicode Set references."); + RBBIDebugPuts("\nParse tree after flattening Unicode Set references."); fTree->printTree(TRUE); } #endif @@ -375,6 +375,25 @@ void RBBITableBuilder::calcFollowPos(RBBINode *n) { } +//----------------------------------------------------------------------------- +// +// addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged +// as roots of a rule to a destination vector. +// +//----------------------------------------------------------------------------- +void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) { + if (node == NULL || U_FAILURE(*fStatus)) { + return; + } + if (node->fRuleRoot) { + dest->addElement(node, *fStatus); + // Note: rules cannot nest. If we found a rule start node, + // no child node can also be a start node. 
+ return; + } + addRuleRootNodes(dest, node->fLeftChild); + addRuleRootNodes(dest, node->fRightChild); +} //----------------------------------------------------------------------------- // @@ -401,19 +420,24 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) { return; } - // Get all nodes that can be the start a match, which is FirstPosition() - // of the portion of the tree corresponding to user-written rules. - // See the tree description in bofFixup(). - RBBINode *userRuleRoot = tree; - if (fRB->fSetBuilder->sawBOF()) { - userRuleRoot = tree->fLeftChild->fRightChild; - } - U_ASSERT(userRuleRoot != NULL); - UVector *matchStartNodes = userRuleRoot->fFirstPosSet; + // Collect all leaf nodes that can start matches for rules + // with inbound chaining enabled, which is the union of the + // firstPosition sets from each of the rule root nodes. + + UVector ruleRootNodes(*fStatus); + addRuleRootNodes(&ruleRootNodes, tree); + UVector matchStartNodes(*fStatus); + for (int i=0; i<ruleRootNodes.size(); ++i) { + RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(i)); + if (node->fChainIn) { + setAdd(&matchStartNodes, node->fFirstPosSet); + } + } + if (U_FAILURE(*fStatus)) { + return; + } - // Iteratate over all leaf nodes, - // int32_t endNodeIx; int32_t startNodeIx; @@ -455,8 +479,8 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) { // Now iterate over the nodes that can start a match, looking for ones // with the same char class as our ending node. 
RBBINode *startNode; - for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) { - startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx); + for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) { + startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx); if (startNode->fType != RBBINode::leafChar) { continue; } @@ -1032,6 +1056,8 @@ void RBBITableBuilder::printPosSets(RBBINode *n) { if (n==NULL) { return; } + printf("\n"); + RBBINode::printNodeHeader(); n->printNode(); RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE"); @@ -1141,8 +1167,8 @@ void RBBITableBuilder::exportTable(void *where) { void RBBITableBuilder::printSet(UVector *s) { int32_t i; for (i=0; i<s->size(); i++) { - void *v = s->elementAt(i); - RBBIDebugPrintf("%10p", v); + const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i)); + RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum); } RBBIDebugPrintf("\n"); } diff --git a/icu4c/source/common/rbbitblb.h b/icu4c/source/common/rbbitblb.h index 3805b6752a3..9e65bd93d01 100644 --- a/icu4c/source/common/rbbitblb.h +++ b/icu4c/source/common/rbbitblb.h @@ -4,7 +4,7 @@ /* ********************************************************************** -* Copyright (c) 2002-2005, International Business Machines +* Copyright (c) 2002-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -58,6 +58,8 @@ private: void flagTaggedStates(); void mergeRuleStatusVals(); + void addRuleRootNodes(UVector *dest, RBBINode *node); + // Set functions for UVector. // TODO: make a USet subclass of UVector diff --git a/icu4c/source/data/brkitr/rules/char.txt b/icu4c/source/data/brkitr/rules/char.txt index 5164a682792..c53caae0461 100644 --- a/icu4c/source/data/brkitr/rules/char.txt +++ b/icu4c/source/data/brkitr/rules/char.txt @@ -1,12 +1,12 @@ # -# Copyright (C) 2002-2015, International Business Machines Corporation and others. +# Copyright (C) 2002-2016, International Business Machines Corporation and others. # All Rights Reserved. 
# # file: char.txt # # ICU Character Break Rules, also known as Grapheme Cluster Boundaries # See Unicode Standard Annex #29. -# These rules are based on UAX #29 Revision 20 for Unicode Version 6.2 +# These rules are based on UAX #29 Revision 28 (Draft 3) for Unicode Version 9.0 # # @@ -14,9 +14,9 @@ # $CR = [\p{Grapheme_Cluster_Break = CR}]; $LF = [\p{Grapheme_Cluster_Break = LF}]; -$Control = [\p{Grapheme_Cluster_Break = Control}]; +$Control = [[\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]]; # TODO: Restore if the Prepend set becomes non-empty again: $Prepend = [\p{Grapheme_Cluster_Break = Prepend}]; -$Extend = [\p{Grapheme_Cluster_Break = Extend}]; +$Extend = [[\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]]; $SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}]; $Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}]; @@ -30,10 +30,18 @@ $T = [\p{Grapheme_Cluster_Break = T}]; $LV = [\p{Grapheme_Cluster_Break = LV}]; $LVT = [\p{Grapheme_Cluster_Break = LVT}]; +# Emoji defintions scraped from http://www.unicode.org/Public/emoji/2.0//emoji-data.txt + +$E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; + +$E_Modifier = [\U0001F3FB-\U0001F3FF]; + +$ZWJ = [\u200D]; +$GAZ = [\U0001F455-\U0001F469\U0001F48B\U0001F5E8\u2764]; ## ------------------------------------------------- !!chain; - +!!lookAheadHardBreak; !!forward; $CR $LF; @@ -42,13 +50,24 @@ $L ($L | $V | $LV | $LVT); ($LV | $V) ($V | $T); ($LVT | $T) $T; -$Regional_Indicator $Regional_Indicator; +# GB 8. 
Keep pairs of regional indicators together +# Note that hard break '/' rule triggers only if there are three or more initial RIs, + +^$Regional_Indicator $Regional_Indicator / $Regional_Indicator; +^$Regional_Indicator $Regional_Indicator; -[^$Control $CR $LF] $Extend; +# GB 9 +[^$Control $CR $LF] ($Extend | $ZWJ); +# GB 9a (only for extended grapheme clusters) [^$Control $CR $LF] $SpacingMark; -# TODO: Restore if the Prepend set becomes non-empty again: $Prepend [^$Control $CR $LF]; +# GB 9b Restore if the Prepend set becomes non-empty again: $Prepend [^$Control $CR $LF]; +# GB9c Emoji proposal +($E_Base | $GAZ) $E_Modifier; + +# GB 9d Don't break between ZWJ and Glue_After_Zwj +$ZWJ $GAZ; ## ------------------------------------------------- @@ -58,23 +77,29 @@ $LF $CR; ($V | $T) ($LV | $V); $T ($LVT | $T); -$Regional_Indicator $Regional_Indicator; +# GB 8. Going backwards, we must scan through any number of regional indicators as pairs. +# +$Regional_Indicator $Regional_Indicator / ($Regional_Indicator $Regional_Indicator)* [{eof}[^$Regional_Indicator]]; + +# GB 9 +($Extend | $ZWJ) [^$Control $CR $LF]; #note that this will chain into Regional_Indicator when needed. -$Extend [^$Control $CR $LF]; +# GB 9a $SpacingMark [^$Control $CR $LF]; -# TODO: Restore if the Prepend set becomes non-empty again: [^$Control $CR $LF] $Prepend; +# GB 9b Restore if the Prepend set becomes non-empty again: [^$Control $CR $LF] $Prepend; +# GB 9c +$E_Modifier ($E_Base | $GAZ); + +# GB 9d Don't break between ZWJ and Glue_After_Zwj +$GAZ $ZWJ; ## ------------------------------------------------- -# We don't logically need safe char break rules, but if we don't provide any at all -# the engine for preceding() and following() will fall back to the -# old style inefficient algorithm. 
!!safe_reverse; -$LF $CR; +$Regional_Indicator $Regional_Indicator; ## ------------------------------------------------- !!safe_forward; -$CR $LF; - +$Regional_Indicator $Regional_Indicator; diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt index cfb451e789a..3ed8bd74429 100644 --- a/icu4c/source/data/brkitr/rules/line.txt +++ b/icu4c/source/data/brkitr/rules/line.txt @@ -1,13 +1,16 @@ -# Copyright (c) 2002-2015 International Business Machines Corporation and +# Copyright (c) 2002-2016 International Business Machines Corporation and # others. All Rights Reserved. # # file: line.txt # # Line Breaking Rules -# Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 34 for Unicode 8.0 +# Implement default line breaking as defined by +# Unicode Standard Annex #14 Revision 35 for Unicode 8.0 # http://www.unicode.org/reports/tr14/ # +# Includes the Emoji breaking proposals from Unicode L2/16-011R3. +# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf +# # TODO: Rule LB 8 remains as it was in Unicode 5.2 # This is only because of a limitation of ICU break engine implementation, # not because the older behavior is desirable. @@ -20,8 +23,6 @@ # !!chain; -!!LBCMNoChain; - !!lookAheadHardBreak; # @@ -59,8 +60,13 @@ # See rule LB 19 for an example. # +# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available. 
+ +$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; +$EM = [\U0001F3FB-\U0001F3FF]; + $AI = [:LineBreak = Ambiguous:]; -$AL = [:LineBreak = Alphabetic:]; +$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]]; $BA = [:LineBreak = Break_After:]; $BB = [:LineBreak = Break_Before:]; $BK = [:LineBreak = Mandatory_Break:]; @@ -68,7 +74,7 @@ $B2 = [:LineBreak = Break_Both:]; $CB = [:LineBreak = Contingent_Break:]; $CJ = [:LineBreak = Conditional_Japanese_Starter:]; $CL = [:LineBreak = Close_Punctuation:]; -$CM = [:LineBreak = Combining_Mark:]; +$CM = [[:LineBreak = Combining_Mark:] \u200d]; $CP = [:LineBreak = Close_Parenthesis:]; $CR = [:LineBreak = Carriage_Return:]; $EX = [:LineBreak = Exclamation:]; @@ -77,7 +83,7 @@ $HL = [:LineBreak = Hebrew_Letter:]; $HY = [:LineBreak = Hyphen:]; $H2 = [:LineBreak = H2:]; $H3 = [:LineBreak = H3:]; -$ID = [:LineBreak = Ideographic:]; +$ID = [[:LineBreak = Ideographic:][\u2764] - $EB]; $IN = [:LineBreak = Inseperable:]; $IS = [:LineBreak = Infix_Numeric:]; $JL = [:LineBreak = JL:]; @@ -99,6 +105,7 @@ $SY = [:LineBreak = Break_Symbols:]; $WJ = [:LineBreak = Word_Joiner:]; $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; +$ZWJ = [\u200d]; # Dictionary character set, for triggering language-based break engines. Currently # limited to LineBreak=Complex_Context. 
Note that this set only works in Unicode @@ -131,7 +138,6 @@ $HLcm = $HL $CM*; $HYcm = $HY $CM*; $H2cm = $H2 $CM*; $H3cm = $H3 $CM*; -$IDcm = $ID $CM*; $INcm = $IN $CM*; $IScm = $IS $CM*; $JLcm = $JL $CM*; @@ -160,6 +166,8 @@ $BB $CM+; $B2 $CM+; $CL $CM+; $CP $CM+; +$EB $CM+; +$EM $CM+; $EX $CM+; $GL $CM+; $HL $CM+; @@ -208,7 +216,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM]; # Rule LB 4, 5 Mandatory (Hard) breaks. # $LB4Breaks = [$BK $CR $LF $NL]; -$LB4NonBreaks = [^$BK $CR $LF $NL]; +$LB4NonBreaks = [^$BK $CR $LF $NL $CM]; $CR $LF {100}; # @@ -216,13 +224,13 @@ $CR $LF {100}; # $LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks. $CAN_CM $CM* $LB4Breaks {100}; -$CM+ $LB4Breaks {100}; +^$CM+ $LB4Breaks {100}; # LB 7 x SP # x ZW $LB4NonBreaks [$SP $ZW]; $CAN_CM $CM* [$SP $ZW]; -$CM+ [$SP $ZW]; +^$CM+ [$SP $ZW]; # # LB 8 Break after zero width space @@ -233,20 +241,23 @@ $CM+ [$SP $ZW]; $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; +# LB 8a ZWJ x ID Emoji proposal. +# +$ZWJ ($ID | $EB | $EM); -# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL -# $CM not covered by the above needs to behave like $AL +# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL +# $CM not covered by the above needs to behave like $AL # See definition of $CAN_CM. $CAN_CM $CM+; # Stick together any combining sequences that don't match other rules. -$CM+; +^$CM+; # # LB 11 Do not break before or after WORD JOINER & related characters. # $CAN_CM $CM* $WJcm; $LB8NonBreaks $WJcm; -$CM+ $WJcm; +^$CM+ $WJcm; $WJcm $CANT_CM; $WJcm $CAN_CM $CM*; @@ -257,13 +268,13 @@ $WJcm $CAN_CM $CM*; # $GLcm $CAN_CM $CM*; $GLcm $CANT_CM; - + # # LB 12a Do not break before NBSP and related characters ... 
# [^SP BA HY] x GL # [[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm; -$CM+ GLcm; +^$CM+ $GLcm; @@ -272,23 +283,23 @@ $CM+ GLcm; # $LB8NonBreaks $CL; $CAN_CM $CM* $CL; -$CM+ $CL; # by rule 10, stand-alone CM behaves as AL +^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $CP; $CAN_CM $CM* $CP; -$CM+ $CP; # by rule 10, stand-alone CM behaves as AL +^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $EX; $CAN_CM $CM* $EX; -$CM+ $EX; # by rule 10, stand-alone CM behaves as AL +^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $IS; $CAN_CM $CM* $IS; -$CM+ $IS; # by rule 10, stand-alone CM behaves as AL +^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $SY; $CAN_CM $CM* $SY; -$CM+ $SY; # by rule 10, stand-alone CM behaves as AL +^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL # @@ -318,12 +329,10 @@ $LB18Breaks = [$LB8Breaks $SP]; # LB 19 # x QU $LB18NonBreaks $CM* $QUcm; -$CM+ $QUcm; +^$CM+ $QUcm; # QU x $QUcm .?; -$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc. - # TODO: I don't think this rule is needed. 
# LB 20 @@ -335,14 +344,15 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # LB 21 x (BA | HY | NS) # BB x # -$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); +$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); +^$CM+ ($BAcm | $HYcm | $NScm); $BBcm [^$CB]; # $BB x $BBcm $LB20NonBreaks $CM*; # LB 21a Don't break after Hebrew + Hyphen # HL (HY | BA) x -# +# $HLcm ($HYcm | $BAcm) [^$CB]?; # LB 21b (forward) Don't break between SY and HL @@ -351,25 +361,25 @@ $SYcm $HLcm; # LB 22 ($ALcm | $HLcm) $INcm; -$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL +^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL $EXcm $INcm; -$IDcm $INcm; +($ID | $EB | $EM) $CM* $INcm; $INcm $INcm; $NUcm $INcm; # $LB 23 -$IDcm $POcm; +($ID | $EB | $EM) $CM* $POcm; $ALcm $NUcm; # includes $LB19 $HLcm $NUcm; -$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL +^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL $NUcm $ALcm; $NUcm $HLcm; # # LB 24 # -$PRcm $IDcm; +$PRcm ($ID | $EB | $EM); $PRcm ($ALcm | $HLcm); $POcm ($ALcm | $HLcm); @@ -393,18 +403,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm); # LB 28 Do not break between alphabetics # ($ALcm | $HLcm) ($ALcm | $HLcm); -$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL +^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL # LB 29 $IScm ($ALcm | $HLcm); # LB 30 ($ALcm | $HLcm | $NUcm) $OPcm; -$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL. +^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL. $CPcm ($ALcm | $HLcm | $NUcm); -# LB 30a Do not break between regional indicators. -$RIcm $RIcm; +# LB 30a Do not break between regional indicators. Break after pairs of them. 
+# Tricky interaction with LB8a: ZWJ x ID +$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; +$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}]; +$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; + +$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}]; +$RIcm $RIcm $ZWJ ($ID | $EB | $EM); + +# LB 30b Do not break between an Emoji Base and an Emoji Modifier +$EB $CM* $EM; # # Reverse Rules. @@ -413,34 +432,36 @@ $RIcm $RIcm; !!reverse; -$CM+ $ALPlus; -$CM+ $BA; -$CM+ $BB; -$CM+ $B2; -$CM+ $CL; -$CM+ $CP; -$CM+ $EX; -$CM+ $GL; -$CM+ $HL; -$CM+ $HY; -$CM+ $H2; -$CM+ $H3; -$CM+ $ID; -$CM+ $IN; -$CM+ $IS; -$CM+ $JL; -$CM+ $JV; -$CM+ $JT; -$CM+ $NS; -$CM+ $NU; -$CM+ $OP; -$CM+ $PO; -$CM+ $PR; -$CM+ $QU; -$CM+ $RI; -$CM+ $SY; -$CM+ $WJ; -$CM+; +^$CM+ $ALPlus; +^$CM+ $BA; +^$CM+ $BB; +^$CM+ $B2; +^$CM+ $CL; +^$CM+ $CP; +^$CM+ $EB; +^$CM+ $EM; +^$CM+ $EX; +^$CM+ $GL; +^$CM+ $HL; +^$CM+ $HY; +^$CM+ $H2; +^$CM+ $H3; +^$CM+ $ID; +^$CM+ $IN; +^$CM+ $IS; +^$CM+ $JL; +^$CM+ $JV; +^$CM+ $JT; +^$CM+ $NS; +^$CM+ $NU; +^$CM+ $OP; +^$CM+ $PO; +^$CM+ $PR; +^$CM+ $QU; +^$CM+ $RI; +^$CM+ $SY; +^$CM+ $WJ; +^$CM+; # @@ -452,14 +473,14 @@ $AL_FOLLOW $CM+ / ( [$BK $CR $LF $NL $ZW {eof}] | $SP+ $CM+ $SP | $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break. - # LB14 says OP SP* x . + # LB14 says OP SP* x . # becomes OP SP* x AL # becomes OP SP* x CM+ AL_FOLLOW # # Further note: the $AL in [$AL {eof}] is only to work around # a rule compiler bug which complains about # empty sets otherwise. - + # # Sequences of the form (shown forwards) # [CANT_CM] [CM] [PR] @@ -471,7 +492,7 @@ $AL_FOLLOW $CM+ / ( -# LB 4, 5, 5 +# LB 4, 5, 6 $LB4Breaks [$LB4NonBreaks-$CM]; $LB4Breaks $CM+ $CAN_CM; @@ -488,30 +509,37 @@ $LF $CR; # Requires an engine enhancement. 
# / $SP* $ZW +# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3 +# The ZWJ will look like a CM to whatever precedes it. +# +($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?; + + # LB 9,10 Combining marks. # X $CM needs to behave like X, where X is not $SP or controls. # $CM not covered by the above needs to behave like $AL # Stick together any combining sequences that don't match other rules. -$CM+ $CAN_CM; +^$CM+ $CAN_CM; # LB 11 -$CM* $WJ $CM* $CAN_CM; -$CM* $WJ [$LB8NonBreaks-$CM]; +# +$WJ $CM* $CAN_CM; +$WJ [$LB8NonBreaks-$CM]; $CANT_CM $CM* $WJ; -$CM* $CAN_CM $CM* $WJ; +$CAN_CM $CM* $WJ; # LB 12a # [^SP BA HY] x GL # -$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]]; +$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]]; # LB 12 # GL x # $CANT_CM $CM* $GL; -$CM* $CAN_CM $CM* $GL; +$CAN_CM $CM* $GL; # LB 13 @@ -532,28 +560,26 @@ $SY [$LB8NonBreaks-$CM]; # OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | IY) # This really wants to chain at the $CM+ (which is acting as an $AL) # except for $CM chaining being disabled. -[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP; +[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP; # LB 14 OP SP* x # -$CM* $CAN_CM $SP* $CM* $OP; +$CAN_CM $SP* $CM* $OP; $CANT_CM $SP* $CM* $OP; $AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP - - $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; -$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; -$SY $CM $SP+ $OP; # TODO: Experiment. Remove. + $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; +$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; # LB 15 -$CM* $OP $SP* $CM* $QU; +$OP $SP* $CM* $QU; # LB 16 -$CM* $NS $SP* $CM* ($CL | $CP); +$NS $SP* $CM* ($CL | $CP); # LB 17 -$CM* $B2 $SP* $CM* $B2; +$B2 $SP* $CM* $B2; # LB 18 break after spaces # Nothing explicit needed here. @@ -562,82 +588,100 @@ $CM* $B2 $SP* $CM* $B2; # # LB 19 # -$CM* $QU $CM* $CAN_CM; # . x QU -$CM* $QU $LB18NonBreaks; +$QU $CM* $CAN_CM; # . x QU +$QU $LB18NonBreaks; -$CM* $CAN_CM $CM* $QU; # QU x . +$CAN_CM $CM* $QU; # QU x . 
$CANT_CM $CM* $QU; - + # # LB 20 Break before and after CB. # nothing needed here. # # LB 21 -$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) +($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) -$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . -[^$CB] $CM* $BB; # +[$LB20NonBreaks-$CM] $CM* $BB; # BB x . +[^$CB] $CM* $BB; # # LB21a [^$CB] $CM* ($HY | $BA) $CM* $HL; # LB21b (reverse) -$CM* $HL $CM* $SY; +$HL $CM* $SY; # LB 22 -$CM* $IN $CM* ($ALPlus | $HL); -$CM* $IN $CM* $EX; -$CM* $IN $CM* $ID; -$CM* $IN $CM* $IN; -$CM* $IN $CM* $NU; +$IN $CM* ($ALPlus | $HL); +$IN $CM* $EX; +$IN $CM* ($ID | $EB | $EM); +$IN $CM* $IN; +$IN $CM* $NU; # LB 23 -$CM* $PO $CM* $ID; -$CM* $NU $CM* ($ALPlus | $HL); -$CM* ($ALPlus | $HL) $CM* $NU; +$PO $CM* ($ID | $EB | $EM); +$NU $CM* ($ALPlus | $HL); +($ALPlus | $HL) $CM* $NU; # LB 24 -$CM* $ID $CM* $PR; -$CM* ($ALPlus | $HL) $CM* $PR; -$CM* ($ALPlus | $HL) $CM* $PO; +($ID | $EB | $EM) $CM* $PR; +($ALPlus | $HL) $CM* $PR; +($ALPlus | $HL) $CM* $PO; # LB 25 ($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?; # LB 26 -$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL; -$CM* ($JT | $JV) $CM* ($H2 | $JV); -$CM* $JT $CM* ($H3 | $JT); +($H3 | $H2 | $JV | $JL) $CM* $JL; +($JT | $JV) $CM* ($H2 | $JV); +$JT $CM* ($H3 | $JT); # LB 27 -$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL); -$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL); -$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; +$IN $CM* ($H3 | $H2 | $JT | $JV | $JL); +$PO $CM* ($H3 | $H2 | $JT | $JV | $JL); + ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; # LB 28 -$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL); +($ALPlus | $HL) $CM* ($ALPlus | $HL); # LB 29 -$CM* ($ALPlus | $HL) $CM* $IS; +($ALPlus | $HL) $CM* $IS; # LB 30 -$CM* $OP $CM* ($ALPlus | $HL | $NU); -$CM* ($ALPlus | $HL | $NU) $CM* $CP; +$OP $CM* ($ALPlus | $HL | $NU); +($ALPlus | $HL | $NU) $CM* $CP; # LB 30a -$CM* $RI $CM* $RI; +# Pairs of Regional Indicators. 
+# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs, +# the second with an even number. Stripping away the cruft they look like +# [^RI] RI / (RI RI)+ ^RI; +# [^RI] RI RI / (RI RI)+ ^RI; +# +[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; +[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; + +# In general, adjacent RIs stay together. The hard-break rules, above, overide this, forcing in the boundaries between pairs. +$RI $CM* $RI; + +# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI". +$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL)); + + +# LB 30b Do not break between an Emoji Base and an Emoji Modifier +$EM $CM* $EB; + ## ------------------------------------------------- !!safe_reverse; # LB 9 -$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; -$CM+ $SP / .; +^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; +^$CM+ $SP / .; # LB 14 $SP+ $CM* $OP; @@ -658,6 +702,9 @@ $CM* ($HY | $BA) $CM* $HL; ($CM* ($IS | $SY))+ $CM* $NU; ($CL | $CP) $CM* ($NU | $IS | $SY); +# LB 30 +($CM* $RI)+; + # For dictionary-based break $dictionary $dictionary; @@ -674,6 +721,6 @@ $dictionary $dictionary; # turn off rule chaining. We don't want to move more # than necessary. 
# -[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary]; +^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary]; $dictionary $dictionary; diff --git a/icu4c/source/data/brkitr/rules/line_fi.txt b/icu4c/source/data/brkitr/rules/line_fi.txt index 0f104751209..b0dd906347b 100644 --- a/icu4c/source/data/brkitr/rules/line_fi.txt +++ b/icu4c/source/data/brkitr/rules/line_fi.txt @@ -1,14 +1,17 @@ -# Copyright (c) 2002-2015 International Business Machines Corporation and +# Copyright (c) 2002-2016 International Business Machines Corporation and # others. All Rights Reserved. # # file: line_fi.txt # # Line Breaking Rules -# Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 34 for Unicode 8.0 +# Implement default line breaking as defined by +# Unicode Standard Annex #14 Revision 35 for Unicode 8.0 # http://www.unicode.org/reports/tr14/ # tailored as noted in 2nd paragraph below.. # +# Includes the Emoji breaking proposals from Unicode L2/16-011R3. +# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf +# # TODO: Rule LB 8 remains as it was in Unicode 5.2 # This is only because of a limitation of ICU break engine implementation, # not because the older behavior is desirable. @@ -22,8 +25,6 @@ # !!chain; -!!LBCMNoChain; - !!lookAheadHardBreak; # @@ -61,9 +62,14 @@ # See rule LB 19 for an example. # +# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available. 
+ +$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; +$EM = [\U0001F3FB-\U0001F3FF]; + $AI = [:LineBreak = Ambiguous:]; -$AL = [:LineBreak = Alphabetic:]; -$BA = [[:LineBreak = Break_After:] - [\u2010]]; +$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]]; +$BA = [:LineBreak = Break_After:]; $HH = [\u2010]; $BB = [:LineBreak = Break_Before:]; $BK = [:LineBreak = Mandatory_Break:]; @@ -71,7 +77,7 @@ $B2 = [:LineBreak = Break_Both:]; $CB = [:LineBreak = Contingent_Break:]; $CJ = [:LineBreak = Conditional_Japanese_Starter:]; $CL = [:LineBreak = Close_Punctuation:]; -$CM = [:LineBreak = Combining_Mark:]; +$CM = [[:LineBreak = Combining_Mark:] \u200d]; $CP = [:LineBreak = Close_Parenthesis:]; $CR = [:LineBreak = Carriage_Return:]; $EX = [:LineBreak = Exclamation:]; @@ -80,7 +86,7 @@ $HL = [:LineBreak = Hebrew_Letter:]; $HY = [:LineBreak = Hyphen:]; $H2 = [:LineBreak = H2:]; $H3 = [:LineBreak = H3:]; -$ID = [:LineBreak = Ideographic:]; +$ID = [[:LineBreak = Ideographic:][\u2764] - $EB]; $IN = [:LineBreak = Inseperable:]; $IS = [:LineBreak = Infix_Numeric:]; $JL = [:LineBreak = JL:]; @@ -102,6 +108,7 @@ $SY = [:LineBreak = Break_Symbols:]; $WJ = [:LineBreak = Word_Joiner:]; $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; +$ZWJ = [\u200d]; # Dictionary character set, for triggering language-based break engines. Currently # limited to LineBreak=Complex_Context. 
Note that this set only works in Unicode @@ -135,7 +142,6 @@ $HLcm = $HL $CM*; $HYcm = $HY $CM*; $H2cm = $H2 $CM*; $H3cm = $H3 $CM*; -$IDcm = $ID $CM*; $INcm = $IN $CM*; $IScm = $IS $CM*; $JLcm = $JL $CM*; @@ -165,6 +171,8 @@ $BB $CM+; $B2 $CM+; $CL $CM+; $CP $CM+; +$EB $CM+; +$EM $CM+; $EX $CM+; $GL $CM+; $HL $CM+; @@ -213,7 +221,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM]; # Rule LB 4, 5 Mandatory (Hard) breaks. # $LB4Breaks = [$BK $CR $LF $NL]; -$LB4NonBreaks = [^$BK $CR $LF $NL]; +$LB4NonBreaks = [^$BK $CR $LF $NL $CM]; $CR $LF {100}; # @@ -221,13 +229,13 @@ $CR $LF {100}; # $LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks. $CAN_CM $CM* $LB4Breaks {100}; -$CM+ $LB4Breaks {100}; +^$CM+ $LB4Breaks {100}; # LB 7 x SP # x ZW $LB4NonBreaks [$SP $ZW]; $CAN_CM $CM* [$SP $ZW]; -$CM+ [$SP $ZW]; +^$CM+ [$SP $ZW]; # # LB 8 Break after zero width space @@ -238,20 +246,23 @@ $CM+ [$SP $ZW]; $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; +# LB 8a ZWJ x ID Emoji proposal. +# +$ZWJ ($ID | $EB | $EM); -# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL -# $CM not covered by the above needs to behave like $AL +# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL +# $CM not covered by the above needs to behave like $AL # See definition of $CAN_CM. $CAN_CM $CM+; # Stick together any combining sequences that don't match other rules. -$CM+; +^$CM+; # # LB 11 Do not break before or after WORD JOINER & related characters. # $CAN_CM $CM* $WJcm; $LB8NonBreaks $WJcm; -$CM+ $WJcm; +^$CM+ $WJcm; $WJcm $CANT_CM; $WJcm $CAN_CM $CM*; @@ -262,13 +273,13 @@ $WJcm $CAN_CM $CM*; # $GLcm $CAN_CM $CM*; $GLcm $CANT_CM; - + # # LB 12a Do not break before NBSP and related characters ... 
# [^SP BA HY] x GL # [[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm; -$CM+ GLcm; +^$CM+ $GLcm; @@ -277,23 +288,23 @@ $CM+ GLcm; # $LB8NonBreaks $CL; $CAN_CM $CM* $CL; -$CM+ $CL; # by rule 10, stand-alone CM behaves as AL +^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $CP; $CAN_CM $CM* $CP; -$CM+ $CP; # by rule 10, stand-alone CM behaves as AL +^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $EX; $CAN_CM $CM* $EX; -$CM+ $EX; # by rule 10, stand-alone CM behaves as AL +^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $IS; $CAN_CM $CM* $IS; -$CM+ $IS; # by rule 10, stand-alone CM behaves as AL +^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $SY; $CAN_CM $CM* $SY; -$CM+ $SY; # by rule 10, stand-alone CM behaves as AL +^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL # @@ -323,12 +334,10 @@ $LB18Breaks = [$LB8Breaks $SP]; # LB 19 # x QU $LB18NonBreaks $CM* $QUcm; -$CM+ $QUcm; +^$CM+ $QUcm; # QU x $QUcm .?; -$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc. - # TODO: I don't think this rule is needed. 
# LB 20 @@ -344,6 +353,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; $LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm) / $AL; $LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm); ($HY | $HH) $AL; +^$CM+ ($BAcm | $HYcm | $HHcm | $NScm); $BBcm [^$CB]; # $BB x $BBcm $LB20NonBreaks $CM*; @@ -359,25 +369,25 @@ $SYcm $HLcm; # LB 22 ($ALcm | $HLcm) $INcm; -$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL +^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL $EXcm $INcm; -$IDcm $INcm; +($ID | $EB | $EM) $CM* $INcm; $INcm $INcm; $NUcm $INcm; # $LB 23 -$IDcm $POcm; +($ID | $EB | $EM) $CM* $POcm; $ALcm $NUcm; # includes $LB19 $HLcm $NUcm; -$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL +^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL $NUcm $ALcm; $NUcm $HLcm; # # LB 24 # -$PRcm $IDcm; +$PRcm ($ID | $EB | $EM); $PRcm ($ALcm | $HLcm); $POcm ($ALcm | $HLcm); @@ -401,18 +411,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm); # LB 28 Do not break between alphabetics # ($ALcm | $HLcm) ($ALcm | $HLcm); -$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL +^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL # LB 29 $IScm ($ALcm | $HLcm); # LB 30 ($ALcm | $HLcm | $NUcm) $OPcm; -$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL. +^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL. $CPcm ($ALcm | $HLcm | $NUcm); -# LB 30a Do not break between regional indicators. -$RIcm $RIcm; +# LB 30a Do not break between regional indicators. Break after pairs of them. 
+# Tricky interaction with LB8a: ZWJ x ID +$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; +$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}]; +$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; + +$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}]; +$RIcm $RIcm $ZWJ ($ID | $EB | $EM); + +# LB 30b Do not break between an Emoji Base and an Emoji Modifier +$EB $CM* $EM; # # Reverse Rules. @@ -421,35 +440,37 @@ $RIcm $RIcm; !!reverse; -$CM+ $ALPlus; -$CM+ $BA; -$CM+ $HH; -$CM+ $BB; -$CM+ $B2; -$CM+ $CL; -$CM+ $CP; -$CM+ $EX; -$CM+ $GL; -$CM+ $HL; -$CM+ $HY; -$CM+ $H2; -$CM+ $H3; -$CM+ $ID; -$CM+ $IN; -$CM+ $IS; -$CM+ $JL; -$CM+ $JV; -$CM+ $JT; -$CM+ $NS; -$CM+ $NU; -$CM+ $OP; -$CM+ $PO; -$CM+ $PR; -$CM+ $QU; -$CM+ $RI; -$CM+ $SY; -$CM+ $WJ; -$CM+; +^$CM+ $ALPlus; +^$CM+ $BA; +^$CM+ $HH; +^$CM+ $BB; +^$CM+ $B2; +^$CM+ $CL; +^$CM+ $CP; +^$CM+ $EB; +^$CM+ $EM; +^$CM+ $EX; +^$CM+ $GL; +^$CM+ $HL; +^$CM+ $HY; +^$CM+ $H2; +^$CM+ $H3; +^$CM+ $ID; +^$CM+ $IN; +^$CM+ $IS; +^$CM+ $JL; +^$CM+ $JV; +^$CM+ $JT; +^$CM+ $NS; +^$CM+ $NU; +^$CM+ $OP; +^$CM+ $PO; +^$CM+ $PR; +^$CM+ $QU; +^$CM+ $RI; +^$CM+ $SY; +^$CM+ $WJ; +^$CM+; # @@ -461,14 +482,14 @@ $AL_FOLLOW $CM+ / ( [$BK $CR $LF $NL $ZW {eof}] | $SP+ $CM+ $SP | $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break. - # LB14 says OP SP* x . + # LB14 says OP SP* x . # becomes OP SP* x AL # becomes OP SP* x CM+ AL_FOLLOW # # Further note: the $AL in [$AL {eof}] is only to work around # a rule compiler bug which complains about # empty sets otherwise. 
- + # # Sequences of the form (shown forwards) # [CANT_CM] [CM] [PR] @@ -480,7 +501,7 @@ $AL_FOLLOW $CM+ / ( -# LB 4, 5, 5 +# LB 4, 5, 6 $LB4Breaks [$LB4NonBreaks-$CM]; $LB4Breaks $CM+ $CAN_CM; @@ -497,30 +518,37 @@ $LF $CR; # Requires an engine enhancement. # / $SP* $ZW +# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3 +# The ZWJ will look like a CM to whatever precedes it. +# +($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?; + + # LB 9,10 Combining marks. # X $CM needs to behave like X, where X is not $SP or controls. # $CM not covered by the above needs to behave like $AL # Stick together any combining sequences that don't match other rules. -$CM+ $CAN_CM; +^$CM+ $CAN_CM; # LB 11 -$CM* $WJ $CM* $CAN_CM; -$CM* $WJ [$LB8NonBreaks-$CM]; +# +$WJ $CM* $CAN_CM; +$WJ [$LB8NonBreaks-$CM]; $CANT_CM $CM* $WJ; -$CM* $CAN_CM $CM* $WJ; +$CAN_CM $CM* $WJ; # LB 12a # [^SP BA HY] x GL # -$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]]; +$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]]; # LB 12 # GL x # $CANT_CM $CM* $GL; -$CM* $CAN_CM $CM* $GL; +$CAN_CM $CM* $GL; # LB 13 @@ -541,28 +569,26 @@ $SY [$LB8NonBreaks-$CM]; # OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | IY) # This really wants to chain at the $CM+ (which is acting as an $AL) # except for $CM chaining being disabled. -[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP; +[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP; # LB 14 OP SP* x # -$CM* $CAN_CM $SP* $CM* $OP; +$CAN_CM $SP* $CM* $OP; $CANT_CM $SP* $CM* $OP; $AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP - - $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; -$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; -$SY $CM $SP+ $OP; # TODO: Experiment. Remove. + $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; +$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; # LB 15 -$CM* $OP $SP* $CM* $QU; +$OP $SP* $CM* $QU; # LB 16 -$CM* $NS $SP* $CM* ($CL | $CP); +$NS $SP* $CM* ($CL | $CP); # LB 17 -$CM* $B2 $SP* $CM* $B2; +$B2 $SP* $CM* $B2; # LB 18 break after spaces # Nothing explicit needed here. 
@@ -571,13 +597,13 @@ $CM* $B2 $SP* $CM* $B2; # # LB 19 # -$CM* $QU $CM* $CAN_CM; # . x QU -$CM* $QU $LB18NonBreaks; +$QU $CM* $CAN_CM; # . x QU +$QU $LB18NonBreaks; -$CM* $CAN_CM $CM* $QU; # QU x . +$CAN_CM $CM* $QU; # QU x . $CANT_CM $CM* $QU; - + # # LB 20 Break before and after CB. # nothing needed here. @@ -587,69 +613,87 @@ $CM* $CAN_CM $CM* $QU; # QU x . $AL ($HY | $HH) / $SP; # LB 21 -$CM* ($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) +($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) -$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . -[^$CB] $CM* $BB; # +[$LB20NonBreaks-$CM] $CM* $BB; # BB x . +[^$CB] $CM* $BB; # # LB21a [^$CB] $CM* ($HY | $BA | $HH) $CM* $HL; # LB21b (reverse) -$CM* $HL $CM* $SY; +$HL $CM* $SY; # LB 22 -$CM* $IN $CM* ($ALPlus | $HL); -$CM* $IN $CM* $EX; -$CM* $IN $CM* $ID; -$CM* $IN $CM* $IN; -$CM* $IN $CM* $NU; +$IN $CM* ($ALPlus | $HL); +$IN $CM* $EX; +$IN $CM* ($ID | $EB | $EM); +$IN $CM* $IN; +$IN $CM* $NU; # LB 23 -$CM* $PO $CM* $ID; -$CM* $NU $CM* ($ALPlus | $HL); -$CM* ($ALPlus | $HL) $CM* $NU; +$PO $CM* ($ID | $EB | $EM); +$NU $CM* ($ALPlus | $HL); +($ALPlus | $HL) $CM* $NU; # LB 24 -$CM* $ID $CM* $PR; -$CM* ($ALPlus | $HL) $CM* $PR; -$CM* ($ALPlus | $HL) $CM* $PO; +($ID | $EB | $EM) $CM* $PR; +($ALPlus | $HL) $CM* $PR; +($ALPlus | $HL) $CM* $PO; # LB 25 ($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? 
($CM* ($PR | $PO))?; # LB 26 -$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL; -$CM* ($JT | $JV) $CM* ($H2 | $JV); -$CM* $JT $CM* ($H3 | $JT); +($H3 | $H2 | $JV | $JL) $CM* $JL; +($JT | $JV) $CM* ($H2 | $JV); +$JT $CM* ($H3 | $JT); # LB 27 -$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL); -$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL); -$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; +$IN $CM* ($H3 | $H2 | $JT | $JV | $JL); +$PO $CM* ($H3 | $H2 | $JT | $JV | $JL); + ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; # LB 28 -$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL); +($ALPlus | $HL) $CM* ($ALPlus | $HL); # LB 29 -$CM* ($ALPlus | $HL) $CM* $IS; +($ALPlus | $HL) $CM* $IS; # LB 30 -$CM* $OP $CM* ($ALPlus | $HL | $NU); -$CM* ($ALPlus | $HL | $NU) $CM* $CP; +$OP $CM* ($ALPlus | $HL | $NU); +($ALPlus | $HL | $NU) $CM* $CP; # LB 30a -$CM* $RI $CM* $RI; +# Pairs of Regional Indicators. +# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs, +# the second with an even number. Stripping away the cruft they look like +# [^RI] RI / (RI RI)+ ^RI; +# [^RI] RI RI / (RI RI)+ ^RI; +# +[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; +[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; + +# In general, adjacent RIs stay together. The hard-break rules, above, overide this, forcing in the boundaries between pairs. +$RI $CM* $RI; + +# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI". 
+$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL)); + + +# LB 30b Do not break between an Emoji Base and an Emoji Modifier +$EM $CM* $EB; + ## ------------------------------------------------- !!safe_reverse; # LB 9 -$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; -$CM+ $SP / .; +^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; +^$CM+ $SP / .; # LB 14 $SP+ $CM* $OP; @@ -670,6 +714,9 @@ $CM* ($HY | $BA | $HH) $CM* $HL; ($CM* ($IS | $SY))+ $CM* $NU; ($CL | $CP) $CM* ($NU | $IS | $SY); +# LB 30 +($CM* $RI)+; + # For dictionary-based break $dictionary $dictionary; @@ -686,6 +733,6 @@ $dictionary $dictionary; # turn off rule chaining. We don't want to move more # than necessary. # -[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary]; +^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary]; $dictionary $dictionary; diff --git a/icu4c/source/data/brkitr/rules/line_loose.txt b/icu4c/source/data/brkitr/rules/line_loose.txt index 2732b2b9ab7..f91b7b6f025 100644 --- a/icu4c/source/data/brkitr/rules/line_loose.txt +++ b/icu4c/source/data/brkitr/rules/line_loose.txt @@ -1,13 +1,17 @@ -# Copyright (c) 2002-2015 International Business Machines Corporation and +# Copyright (c) 2002-2016 International Business Machines Corporation and # others. All Rights Reserved. # # file: line_loose.txt # # Line Breaking Rules -# Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 34 for Unicode 8.0 +# Implement default line breaking as defined by +# Unicode Standard Annex #14 Revision 35 for Unicode 8.0 # http://www.unicode.org/reports/tr14/ -# tailored as noted in 2nd paragraph below.. +# +# Includes the Emoji breaking proposals from Unicode L2/16-011R3. +# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf +# +# tailored as noted in 2nd paragraph below. 
# # TODO: Rule LB 8 remains as it was in Unicode 5.2 # This is only because of a limitation of ICU break engine implementation, @@ -26,8 +30,6 @@ # !!chain; -!!LBCMNoChain; - !!lookAheadHardBreak; # @@ -65,8 +67,13 @@ # See rule LB 19 for an example. # +# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available. + +$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; +$EM = [\U0001F3FB-\U0001F3FF]; + $AI = [:LineBreak = Ambiguous:]; -$AL = [:LineBreak = Alphabetic:]; +$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]]; $BA = [:LineBreak = Break_After:]; $BB = [:LineBreak = Break_Before:]; $BK = [:LineBreak = Mandatory_Break:]; @@ -74,7 +81,7 @@ $B2 = [:LineBreak = Break_Both:]; $CB = [:LineBreak = Contingent_Break:]; $CJ = [:LineBreak = Conditional_Japanese_Starter:]; $CL = [:LineBreak = Close_Punctuation:]; -$CM = [:LineBreak = Combining_Mark:]; +$CM = [[:LineBreak = Combining_Mark:] \u200d]; $CP = [:LineBreak = Close_Parenthesis:]; $CR = [:LineBreak = Carriage_Return:]; $EX = [:LineBreak = Exclamation:]; @@ -83,7 +90,7 @@ $HL = [:LineBreak = Hebrew_Letter:]; $HY = [:LineBreak = Hyphen:]; $H2 = [:LineBreak = H2:]; $H3 = [:LineBreak = H3:]; -$ID = [[:LineBreak = Ideographic:] $CJ]; +$ID = [[:LineBreak = Ideographic:]$CJ[\u2764] - $EB]; $IN = [:LineBreak = Inseperable:]; $IS = [:LineBreak = Infix_Numeric:]; $JL = [:LineBreak = JL:]; @@ -106,6 +113,7 @@ $SY = [:LineBreak = Break_Symbols:]; $WJ = [:LineBreak = Word_Joiner:]; $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; +$ZWJ = [\u200d]; # Dictionary character set, for triggering language-based break engines. 
Currently # limited to LineBreak=Complex_Context. Note that this set only works in Unicode @@ -138,7 +146,6 @@ $HLcm = $HL $CM*; $HYcm = $HY $CM*; $H2cm = $H2 $CM*; $H3cm = $H3 $CM*; -$IDcm = $ID $CM*; $INcm = $IN $CM*; $IScm = $IS $CM*; $JLcm = $JL $CM*; @@ -168,6 +175,8 @@ $BB $CM+; $B2 $CM+; $CL $CM+; $CP $CM+; +$EB $CM+; +$EM $CM+; $EX $CM+; $GL $CM+; $HL $CM+; @@ -217,7 +226,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM]; # Rule LB 4, 5 Mandatory (Hard) breaks. # $LB4Breaks = [$BK $CR $LF $NL]; -$LB4NonBreaks = [^$BK $CR $LF $NL]; +$LB4NonBreaks = [^$BK $CR $LF $NL $CM]; $CR $LF {100}; # @@ -225,13 +234,13 @@ $CR $LF {100}; # $LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks. $CAN_CM $CM* $LB4Breaks {100}; -$CM+ $LB4Breaks {100}; +^$CM+ $LB4Breaks {100}; # LB 7 x SP # x ZW $LB4NonBreaks [$SP $ZW]; $CAN_CM $CM* [$SP $ZW]; -$CM+ [$SP $ZW]; +^$CM+ [$SP $ZW]; # # LB 8 Break after zero width space @@ -242,20 +251,23 @@ $CM+ [$SP $ZW]; $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; +# LB 8a ZWJ x ID Emoji proposal. +# +$ZWJ ($ID | $EB | $EM); -# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL -# $CM not covered by the above needs to behave like $AL +# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL +# $CM not covered by the above needs to behave like $AL # See definition of $CAN_CM. $CAN_CM $CM+; # Stick together any combining sequences that don't match other rules. -$CM+; +^$CM+; # # LB 11 Do not break before or after WORD JOINER & related characters. # $CAN_CM $CM* $WJcm; $LB8NonBreaks $WJcm; -$CM+ $WJcm; +^$CM+ $WJcm; $WJcm $CANT_CM; $WJcm $CAN_CM $CM*; @@ -266,13 +278,13 @@ $WJcm $CAN_CM $CM*; # $GLcm $CAN_CM $CM*; $GLcm $CANT_CM; - + # # LB 12a Do not break before NBSP and related characters ... 
# [^SP BA HY] x GL # [[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm; -$CM+ GLcm; +^$CM+ $GLcm; @@ -281,23 +293,23 @@ $CM+ GLcm; # $LB8NonBreaks $CL; $CAN_CM $CM* $CL; -$CM+ $CL; # by rule 10, stand-alone CM behaves as AL +^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $CP; $CAN_CM $CM* $CP; -$CM+ $CP; # by rule 10, stand-alone CM behaves as AL +^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $EX; $CAN_CM $CM* $EX; -$CM+ $EX; # by rule 10, stand-alone CM behaves as AL +^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $IS; $CAN_CM $CM* $IS; -$CM+ $IS; # by rule 10, stand-alone CM behaves as AL +^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $SY; $CAN_CM $CM* $SY; -$CM+ $SY; # by rule 10, stand-alone CM behaves as AL +^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL # @@ -329,12 +341,10 @@ $LB18Breaks = [$LB8Breaks $SP]; # LB 19 # x QU $LB18NonBreaks $CM* $QUcm; -$CM+ $QUcm; +^$CM+ $QUcm; # QU x $QUcm .?; -$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc. - # TODO: I don't think this rule is needed. 
# LB 20 @@ -347,14 +357,15 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # BB x # # DO allow breaks here before NSXcm, so don't include it -$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); +$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); +^$CM+ ($BAcm | $HYcm | $NScm); $BBcm [^$CB]; # $BB x $BBcm $LB20NonBreaks $CM*; # LB 21a Don't break after Hebrew + Hyphen # HL (HY | BA) x -# +# $HLcm ($HYcm | $BAcm) [^$CB]?; # LB 21b (forward) Don't break between SY and HL @@ -363,25 +374,25 @@ $SYcm $HLcm; # LB 22 ($ALcm | $HLcm) $INcm; -$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL +^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL $EXcm $INcm; -$IDcm $INcm; +($ID | $EB | $EM) $CM* $INcm; # $INcm $INcm; # delete this rule for CSS loose $NUcm $INcm; # $LB 23 -$IDcm $POcm; +($ID | $EB | $EM) $CM* $POcm; $ALcm $NUcm; # includes $LB19 $HLcm $NUcm; -$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL +^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL $NUcm $ALcm; $NUcm $HLcm; # # LB 24 # -$PRcm $IDcm; +$PRcm ($ID | $EB | $EM); $PRcm ($ALcm | $HLcm); $POcm ($ALcm | $HLcm); @@ -405,18 +416,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm); # LB 28 Do not break between alphabetics # ($ALcm | $HLcm) ($ALcm | $HLcm); -$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL +^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL # LB 29 $IScm ($ALcm | $HLcm); # LB 30 ($ALcm | $HLcm | $NUcm) $OPcm; -$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL. +^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL. $CPcm ($ALcm | $HLcm | $NUcm); -# LB 30a Do not break between regional indicators. -$RIcm $RIcm; +# LB 30a Do not break between regional indicators. Break after pairs of them. 
+# Tricky interaction with LB8a: ZWJ x ID +$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; +$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}]; +$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; + +$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}]; +$RIcm $RIcm $ZWJ ($ID | $EB | $EM); + +# LB 30b Do not break between an Emoji Base and an Emoji Modifier +$EB $CM* $EM; # # Reverse Rules. @@ -425,35 +445,37 @@ $RIcm $RIcm; !!reverse; -$CM+ $ALPlus; -$CM+ $BA; -$CM+ $BB; -$CM+ $B2; -$CM+ $CL; -$CM+ $CP; -$CM+ $EX; -$CM+ $GL; -$CM+ $HL; -$CM+ $HY; -$CM+ $H2; -$CM+ $H3; -$CM+ $ID; -$CM+ $IN; -$CM+ $IS; -$CM+ $JL; -$CM+ $JV; -$CM+ $JT; -$CM+ $NS; -$CM+ $NSX; -$CM+ $NU; -$CM+ $OP; -$CM+ $PO; -$CM+ $PR; -$CM+ $QU; -$CM+ $RI; -$CM+ $SY; -$CM+ $WJ; -$CM+; +^$CM+ $ALPlus; +^$CM+ $BA; +^$CM+ $BB; +^$CM+ $B2; +^$CM+ $CL; +^$CM+ $CP; +^$CM+ $EB; +^$CM+ $EM; +^$CM+ $EX; +^$CM+ $GL; +^$CM+ $HL; +^$CM+ $HY; +^$CM+ $H2; +^$CM+ $H3; +^$CM+ $ID; +^$CM+ $IN; +^$CM+ $IS; +^$CM+ $JL; +^$CM+ $JV; +^$CM+ $JT; +^$CM+ $NS; +^$CM+ $NSX; +^$CM+ $NU; +^$CM+ $OP; +^$CM+ $PO; +^$CM+ $PR; +^$CM+ $QU; +^$CM+ $RI; +^$CM+ $SY; +^$CM+ $WJ; +^$CM+; # @@ -465,14 +487,14 @@ $AL_FOLLOW $CM+ / ( [$BK $CR $LF $NL $ZW {eof}] | $SP+ $CM+ $SP | $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break. - # LB14 says OP SP* x . + # LB14 says OP SP* x . # becomes OP SP* x AL # becomes OP SP* x CM+ AL_FOLLOW # # Further note: the $AL in [$AL {eof}] is only to work around # a rule compiler bug which complains about # empty sets otherwise. 
- + # # Sequences of the form (shown forwards) # [CANT_CM] [CM] [PR] @@ -484,7 +506,7 @@ $AL_FOLLOW $CM+ / ( -# LB 4, 5, 5 +# LB 4, 5, 6 $LB4Breaks [$LB4NonBreaks-$CM]; $LB4Breaks $CM+ $CAN_CM; @@ -501,30 +523,37 @@ $LF $CR; # Requires an engine enhancement. # / $SP* $ZW +# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3 +# The ZWJ will look like a CM to whatever precedes it. +# +($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?; + + # LB 9,10 Combining marks. # X $CM needs to behave like X, where X is not $SP or controls. # $CM not covered by the above needs to behave like $AL # Stick together any combining sequences that don't match other rules. -$CM+ $CAN_CM; +^$CM+ $CAN_CM; # LB 11 -$CM* $WJ $CM* $CAN_CM; -$CM* $WJ [$LB8NonBreaks-$CM]; +# +$WJ $CM* $CAN_CM; +$WJ [$LB8NonBreaks-$CM]; $CANT_CM $CM* $WJ; -$CM* $CAN_CM $CM* $WJ; +$CAN_CM $CM* $WJ; # LB 12a # [^SP BA HY] x GL # -$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]]; +$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]]; # LB 12 # GL x # $CANT_CM $CM* $GL; -$CM* $CAN_CM $CM* $GL; +$CAN_CM $CM* $GL; # LB 13 @@ -545,29 +574,27 @@ $SY [$LB8NonBreaks-$CM]; # OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | IY) # This really wants to chain at the $CM+ (which is acting as an $AL) # except for $CM chaining being disabled. -[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP; +[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP; # LB 14 OP SP* x # -$CM* $CAN_CM $SP* $CM* $OP; +$CAN_CM $SP* $CM* $OP; $CANT_CM $SP* $CM* $OP; $AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP - - $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; -$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; -$SY $CM $SP+ $OP; # TODO: Experiment. Remove. 
+ $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; +$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; # LB 15 -$CM* $OP $SP* $CM* $QU; +$OP $SP* $CM* $QU; # LB 16 # Don't include $NSX here -$CM* $NS $SP* $CM* ($CL | $CP); +$NS $SP* $CM* ($CL | $CP); # LB 17 -$CM* $B2 $SP* $CM* $B2; +$B2 $SP* $CM* $B2; # LB 18 break after spaces # Nothing explicit needed here. @@ -576,13 +603,13 @@ $CM* $B2 $SP* $CM* $B2; # # LB 19 # -$CM* $QU $CM* $CAN_CM; # . x QU -$CM* $QU $LB18NonBreaks; +$QU $CM* $CAN_CM; # . x QU +$QU $LB18NonBreaks; -$CM* $CAN_CM $CM* $QU; # QU x . +$CAN_CM $CM* $QU; # QU x . $CANT_CM $CM* $QU; - + # # LB 20 Break before and after CB. # nothing needed here. @@ -590,69 +617,88 @@ $CM* $CAN_CM $CM* $QU; # QU x . # LB 21 # Don't include $NSX here -$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) +($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) -$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . -[^$CB] $CM* $BB; # +[$LB20NonBreaks-$CM] $CM* $BB; # BB x . +[^$CB] $CM* $BB; # # LB21a [^$CB] $CM* ($HY | $BA) $CM* $HL; # LB21b (reverse) -$CM* $HL $CM* $SY; +$HL $CM* $SY; # LB 22 -$CM* $IN $CM* ($ALPlus | $HL); -$CM* $IN $CM* $EX; -$CM* $IN $CM* $ID; -# $CM* $IN $CM* $IN; # delete this rule for CSS loose -$CM* $IN $CM* $NU; +$IN $CM* ($ALPlus | $HL); +$IN $CM* $EX; +$IN $CM* ($ID | $EB | $EM); +# $IN $CM* $IN; # delete this rule for CSS loose +$IN $CM* $NU; # LB 23 -$CM* $PO $CM* $ID; -$CM* $NU $CM* ($ALPlus | $HL); -$CM* ($ALPlus | $HL) $CM* $NU; +$PO $CM* ($ID | $EB | $EM); +$NU $CM* ($ALPlus | $HL); +($ALPlus | $HL) $CM* $NU; # LB 24 -$CM* $ID $CM* $PR; -$CM* ($ALPlus | $HL) $CM* $PR; -$CM* ($ALPlus | $HL) $CM* $PO; +($ID | $EB | $EM) $CM* $PR; +($ALPlus | $HL) $CM* $PR; +($ALPlus | $HL) $CM* $PO; # LB 25 ($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? 
($CM* ($PR | $PO))?; # LB 26 -$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL; -$CM* ($JT | $JV) $CM* ($H2 | $JV); -$CM* $JT $CM* ($H3 | $JT); +($H3 | $H2 | $JV | $JL) $CM* $JL; +($JT | $JV) $CM* ($H2 | $JV); +$JT $CM* ($H3 | $JT); # LB 27 -$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL); -$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL); -$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; +$IN $CM* ($H3 | $H2 | $JT | $JV | $JL); +$PO $CM* ($H3 | $H2 | $JT | $JV | $JL); + ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; # LB 28 -$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL); +($ALPlus | $HL) $CM* ($ALPlus | $HL); # LB 29 -$CM* ($ALPlus | $HL) $CM* $IS; +($ALPlus | $HL) $CM* $IS; # LB 30 -$CM* $OP $CM* ($ALPlus | $HL | $NU); -$CM* ($ALPlus | $HL | $NU) $CM* $CP; +$OP $CM* ($ALPlus | $HL | $NU); +($ALPlus | $HL | $NU) $CM* $CP; # LB 30a -$CM* $RI $CM* $RI; +# Pairs of Regional Indicators. +# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs, +# the second with an even number. Stripping away the cruft they look like +# [^RI] RI / (RI RI)+ ^RI; +# [^RI] RI RI / (RI RI)+ ^RI; +# +# Line Loose tailoring: Don't include NSX here. +[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; +[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; + +# In general, adjacent RIs stay together. The hard-break rules, above, overide this, forcing in the boundaries between pairs. +$RI $CM* $RI; + +# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI". 
+$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL)); + + +# LB 30b Do not break between an Emoji Base and an Emoji Modifier +$EM $CM* $EB; + ## ------------------------------------------------- !!safe_reverse; # LB 9 -$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; -$CM+ $SP / .; +^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; +^$CM+ $SP / .; # LB 14 $SP+ $CM* $OP; @@ -673,6 +719,9 @@ $CM* ($HY | $BA) $CM* $HL; ($CM* ($IS | $SY))+ $CM* $NU; ($CL | $CP) $CM* ($NU | $IS | $SY); +# LB 30 +($CM* $RI)+; + # For dictionary-based break $dictionary $dictionary; @@ -689,6 +738,6 @@ $dictionary $dictionary; # turn off rule chaining. We don't want to move more # than necessary. # -[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary]; +^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary]; $dictionary $dictionary; diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt index 9934fe3aa1d..f06361dabf9 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt @@ -1,12 +1,16 @@ -# Copyright (c) 2002-2015 International Business Machines Corporation and +# Copyright (c) 2002-2016 International Business Machines Corporation and # others. All Rights Reserved. # # file: line_loose_cj.txt # # Line Breaking Rules -# Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 34 for Unicode 8.0 +# Implement default line breaking as defined by +# Unicode Standard Annex #14 Revision 35 for Unicode 8.0 # http://www.unicode.org/reports/tr14/ +# +# Includes the Emoji breaking proposals from Unicode L2/16-011R3. +# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf +# # tailored as noted in 2nd paragraph below.. 
# # TODO: Rule LB 8 remains as it was in Unicode 5.2 @@ -33,8 +37,6 @@ # !!chain; -!!LBCMNoChain; - !!lookAheadHardBreak; # @@ -72,8 +74,13 @@ # See rule LB 19 for an example. # +# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available. + +$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; +$EM = [\U0001F3FB-\U0001F3FF]; + $AI = [:LineBreak = Ambiguous:]; -$AL = [:LineBreak = Alphabetic:]; +$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]]; $BAX = [\u2010 \u2013]; $BA = [[:LineBreak = Break_After:] - $BAX]; $BB = [:LineBreak = Break_Before:]; @@ -82,7 +89,7 @@ $B2 = [:LineBreak = Break_Both:]; $CB = [:LineBreak = Contingent_Break:]; $CJ = [:LineBreak = Conditional_Japanese_Starter:]; $CL = [:LineBreak = Close_Punctuation:]; -$CM = [:LineBreak = Combining_Mark:]; +$CM = [[:LineBreak = Combining_Mark:] \u200d]; $CP = [:LineBreak = Close_Parenthesis:]; $CR = [:LineBreak = Carriage_Return:]; $EXX = [\uFF01 \uFF1F]; @@ -92,7 +99,7 @@ $HL = [:LineBreak = Hebrew_Letter:]; $HY = [:LineBreak = Hyphen:]; $H2 = [:LineBreak = H2:]; $H3 = [:LineBreak = H3:]; -$ID = [[:LineBreak = Ideographic:] $CJ]; +$ID = [[:LineBreak = Ideographic:] $CJ [\u2764] - $EB]; $IN = [:LineBreak = Inseperable:]; $IS = [:LineBreak = Infix_Numeric:]; $JL = [:LineBreak = JL:]; @@ -117,6 +124,7 @@ $SY = [:LineBreak = Break_Symbols:]; $WJ = [:LineBreak = Word_Joiner:]; $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; +$ZWJ = [\u200d]; # Dictionary character set, for triggering language-based break engines. Currently # limited to LineBreak=Complex_Context. 
Note that this set only works in Unicode @@ -151,7 +159,6 @@ $HLcm = $HL $CM*; $HYcm = $HY $CM*; $H2cm = $H2 $CM*; $H3cm = $H3 $CM*; -$IDcm = $ID $CM*; $INcm = $IN $CM*; $IScm = $IS $CM*; $JLcm = $JL $CM*; @@ -184,6 +191,8 @@ $BB $CM+; $B2 $CM+; $CL $CM+; $CP $CM+; +$EB $CM+; +$EM $CM+; $EX $CM+; $EXX $CM+; $GL $CM+; @@ -236,7 +245,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM]; # Rule LB 4, 5 Mandatory (Hard) breaks. # $LB4Breaks = [$BK $CR $LF $NL]; -$LB4NonBreaks = [^$BK $CR $LF $NL]; +$LB4NonBreaks = [^$BK $CR $LF $NL $CM]; $CR $LF {100}; # @@ -244,13 +253,13 @@ $CR $LF {100}; # $LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks. $CAN_CM $CM* $LB4Breaks {100}; -$CM+ $LB4Breaks {100}; +^$CM+ $LB4Breaks {100}; # LB 7 x SP # x ZW $LB4NonBreaks [$SP $ZW]; $CAN_CM $CM* [$SP $ZW]; -$CM+ [$SP $ZW]; +^$CM+ [$SP $ZW]; # # LB 8 Break after zero width space @@ -261,20 +270,23 @@ $CM+ [$SP $ZW]; $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; +# LB 8a ZWJ x ID Emoji proposal. +# +$ZWJ ($ID | $EB | $EM); -# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL -# $CM not covered by the above needs to behave like $AL +# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL +# $CM not covered by the above needs to behave like $AL # See definition of $CAN_CM. $CAN_CM $CM+; # Stick together any combining sequences that don't match other rules. -$CM+; +^$CM+; # # LB 11 Do not break before or after WORD JOINER & related characters. # $CAN_CM $CM* $WJcm; $LB8NonBreaks $WJcm; -$CM+ $WJcm; +^$CM+ $WJcm; $WJcm $CANT_CM; $WJcm $CAN_CM $CM*; @@ -285,14 +297,13 @@ $WJcm $CAN_CM $CM*; # $GLcm $CAN_CM $CM*; $GLcm $CANT_CM; - + # # LB 12a Do not break before NBSP and related characters ... 
# [^SP BA HY] x GL # [[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GLcm; -$CM+ GLcm; - +^$CM+ $GLcm; # @@ -301,23 +312,23 @@ $CM+ GLcm; # Do not include $EXX here $LB8NonBreaks $CL; $CAN_CM $CM* $CL; -$CM+ $CL; # by rule 10, stand-alone CM behaves as AL +^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $CP; $CAN_CM $CM* $CP; -$CM+ $CP; # by rule 10, stand-alone CM behaves as AL +^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $EX; $CAN_CM $CM* $EX; -$CM+ $EX; # by rule 10, stand-alone CM behaves as AL +^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $IS; $CAN_CM $CM* $IS; -$CM+ $IS; # by rule 10, stand-alone CM behaves as AL +^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $SY; $CAN_CM $CM* $SY; -$CM+ $SY; # by rule 10, stand-alone CM behaves as AL +^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL # @@ -349,12 +360,10 @@ $LB18Breaks = [$LB8Breaks $SP]; # LB 19 # x QU $LB18NonBreaks $CM* $QUcm; -$CM+ $QUcm; +^$CM+ $QUcm; # QU x $QUcm .?; -$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc. - # TODO: I don't think this rule is needed. 
# LB 20 @@ -368,13 +377,14 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # # DO allow breaks here before $BAXcm and $NSXcm, so don't include them $LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); +^$CM+ ($BAcm | $HYcm | $NScm); $BBcm [^$CB]; # $BB x $BBcm $LB20NonBreaks $CM*; # LB 21a Don't break after Hebrew + Hyphen # HL (HY | BA) x -# +# $HLcm ($HYcm | $BAcm | $BAXcm) [^$CB]?; # LB 21b (forward) Don't break between SY and HL @@ -383,19 +393,19 @@ $SYcm $HLcm; # LB 22 ($ALcm | $HLcm) $INcm; -$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL +^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL $EXcm $INcm; -$IDcm $INcm; +($ID | $EB | $EM) $CM* $INcm; # $INcm $INcm; # delete this rule for CSS loose $NUcm $INcm; -# LB 23 +# $LB 23 # Do not include $POX here -$IDcm $POcm; +($ID | $EB | $EM) $CM* $POcm; $ALcm $NUcm; # includes $LB19 $HLcm $NUcm; -$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL +^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL $NUcm $ALcm; $NUcm $HLcm; @@ -403,7 +413,7 @@ $NUcm $HLcm; # LB 24 # # Do not include $PRX here -$PRcm $IDcm; +$PRcm ($ID | $EB | $EM); $PRcm ($ALcm | $HLcm); ($POcm | $POXcm) ($ALcm | $HLcm); @@ -429,18 +439,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm); # LB 28 Do not break between alphabetics # ($ALcm | $HLcm) ($ALcm | $HLcm); -$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL +^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL # LB 29 $IScm ($ALcm | $HLcm); # LB 30 ($ALcm | $HLcm | $NUcm) $OPcm; -$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL. +^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL. $CPcm ($ALcm | $HLcm | $NUcm); -# LB 30a Do not break between regional indicators. -$RIcm $RIcm; +# LB 30a Do not break between regional indicators. Break after pairs of them. 
+# Tricky interaction with LB8a: ZWJ x ID +$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; +$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}]; +$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; + +$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}]; +$RIcm $RIcm $ZWJ ($ID | $EB | $EM); + +# LB 30b Do not break between an Emoji Base and an Emoji Modifier +$EB $CM* $EM; # # Reverse Rules. @@ -449,39 +468,41 @@ $RIcm $RIcm; !!reverse; -$CM+ $ALPlus; -$CM+ $BA; -$CM+ $BAX; -$CM+ $BB; -$CM+ $B2; -$CM+ $CL; -$CM+ $CP; -$CM+ $EX; -$CM+ $EXX; -$CM+ $GL; -$CM+ $HL; -$CM+ $HY; -$CM+ $H2; -$CM+ $H3; -$CM+ $ID; -$CM+ $IN; -$CM+ $IS; -$CM+ $JL; -$CM+ $JV; -$CM+ $JT; -$CM+ $NS; -$CM+ $NSX; -$CM+ $NU; -$CM+ $OP; -$CM+ $PO; -$CM+ $POX; -$CM+ $PR; -$CM+ $PRX; -$CM+ $QU; -$CM+ $RI; -$CM+ $SY; -$CM+ $WJ; -$CM+; +^$CM+ $ALPlus; +^$CM+ $BA; +^$CM+ $BAX; +^$CM+ $BB; +^$CM+ $B2; +^$CM+ $CL; +^$CM+ $CP; +^$CM+ $EB; +^$CM+ $EM; +^$CM+ $EX; +^$CM+ $EXX; +^$CM+ $GL; +^$CM+ $HL; +^$CM+ $HY; +^$CM+ $H2; +^$CM+ $H3; +^$CM+ $ID; +^$CM+ $IN; +^$CM+ $IS; +^$CM+ $JL; +^$CM+ $JV; +^$CM+ $JT; +^$CM+ $NS; +^$CM+ $NSX; +^$CM+ $NU; +^$CM+ $OP; +^$CM+ $PO; +^$CM+ $POX; +^$CM+ $PR; +^$CM+ $PRX; +^$CM+ $QU; +^$CM+ $RI; +^$CM+ $SY; +^$CM+ $WJ; +^$CM+; # @@ -493,14 +514,14 @@ $AL_FOLLOW $CM+ / ( [$BK $CR $LF $NL $ZW {eof}] | $SP+ $CM+ $SP | $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break. - # LB14 says OP SP* x . + # LB14 says OP SP* x . # becomes OP SP* x AL # becomes OP SP* x CM+ AL_FOLLOW # # Further note: the $AL in [$AL {eof}] is only to work around # a rule compiler bug which complains about # empty sets otherwise. 
- + # # Sequences of the form (shown forwards) # [CANT_CM] [CM] [PR] @@ -512,7 +533,7 @@ $AL_FOLLOW $CM+ / ( -# LB 4, 5, 5 +# LB 4, 5, 6 $LB4Breaks [$LB4NonBreaks-$CM]; $LB4Breaks $CM+ $CAN_CM; @@ -529,30 +550,37 @@ $LF $CR; # Requires an engine enhancement. # / $SP* $ZW +# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3 +# The ZWJ will look like a CM to whatever precedes it. +# +($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?; + + # LB 9,10 Combining marks. # X $CM needs to behave like X, where X is not $SP or controls. # $CM not covered by the above needs to behave like $AL # Stick together any combining sequences that don't match other rules. -$CM+ $CAN_CM; +^$CM+ $CAN_CM; # LB 11 -$CM* $WJ $CM* $CAN_CM; -$CM* $WJ [$LB8NonBreaks-$CM]; +# +$WJ $CM* $CAN_CM; +$WJ [$LB8NonBreaks-$CM]; $CANT_CM $CM* $WJ; -$CM* $CAN_CM $CM* $WJ; +$CAN_CM $CM* $WJ; # LB 12a # [^SP BA HY] x GL # -$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]]; +$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]]; # LB 12 # GL x # $CANT_CM $CM* $GL; -$CM* $CAN_CM $CM* $GL; +$CAN_CM $CM* $GL; # LB 13 @@ -574,29 +602,27 @@ $SY [$LB8NonBreaks-$CM]; # OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | IY) # This really wants to chain at the $CM+ (which is acting as an $AL) # except for $CM chaining being disabled. -[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP; +[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP; # LB 14 OP SP* x # -$CM* $CAN_CM $SP* $CM* $OP; +$CAN_CM $SP* $CM* $OP; $CANT_CM $SP* $CM* $OP; $AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP - - $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; -$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; -$SY $CM $SP+ $OP; # TODO: Experiment. Remove. 
+ $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; +$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; # LB 15 -$CM* $OP $SP* $CM* $QU; +$OP $SP* $CM* $QU; # LB 16 # Don't include $NSX here -$CM* $NS $SP* $CM* ($CL | $CP); +$NS $SP* $CM* ($CL | $CP); # LB 17 -$CM* $B2 $SP* $CM* $B2; +$B2 $SP* $CM* $B2; # LB 18 break after spaces # Nothing explicit needed here. @@ -605,13 +631,13 @@ $CM* $B2 $SP* $CM* $B2; # # LB 19 # -$CM* $QU $CM* $CAN_CM; # . x QU -$CM* $QU $LB18NonBreaks; +$QU $CM* $CAN_CM; # . x QU +$QU $LB18NonBreaks; -$CM* $CAN_CM $CM* $QU; # QU x . +$CAN_CM $CM* $QU; # QU x . $CANT_CM $CM* $QU; - + # # LB 20 Break before and after CB. # nothing needed here. @@ -619,73 +645,90 @@ $CM* $CAN_CM $CM* $QU; # QU x . # LB 21 # Don't include $BAX or $NSX here -$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) +($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) -$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . -[^$CB] $CM* $BB; # +[$LB20NonBreaks-$CM] $CM* $BB; # BB x . +[^$CB] $CM* $BB; # # LB21a [^$CB]? $CM* ($HY | $BA | $BAX) $CM* $HL; # LB21b (reverse) -$CM* $HL $CM* $SY; +$HL $CM* $SY; # LB 22 -$CM* $IN $CM* ($ALPlus | $HL); -$CM* $IN $CM* $EX; -$CM* $IN $CM* $ID; -# $CM* $IN $CM* $IN; # delete this rule for CSS loose +$IN $CM* ($ALPlus | $HL); +$IN $CM* $EX; +$IN $CM* ($ID | $EB | $EM); +# $IN $CM* $IN; # delete this rule for CSS loose $CM* $IN $CM* $NU; # LB 23 # Do not include $POX here -$CM* $PO $CM* $ID; -$CM* $NU $CM* ($ALPlus | $HL); -$CM* ($ALPlus | $HL) $CM* $NU; +$PO $CM* ($ID | $EB | $EM); +$NU $CM* ($ALPlus | $HL); +($ALPlus | $HL) $CM* $NU; # LB 24 # Do not include $PRX here -$CM* $ID $CM* $PR; -$CM* ($ALPlus | $HL) $CM* $PR; -$CM* ($ALPlus | $HL) $CM* ($PO | $POX); - +($ID | $EB | $EM) $CM* $PR; +($ALPlus | $HL) $CM* $PR; +($ALPlus | $HL) $CM* ($PO | $POX); # LB 25 # Here do not include $POX at the beginning or $PRX at the end ($CM* ($PR | $PRX | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? 
($CM* ($PR | $PO | $POX))?; # LB 26 -$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL; -$CM* ($JT | $JV) $CM* ($H2 | $JV); -$CM* $JT $CM* ($H3 | $JT); +($H3 | $H2 | $JV | $JL) $CM* $JL; +($JT | $JV) $CM* ($H2 | $JV); +$JT $CM* ($H3 | $JT); # LB 27 # Do not include $POX or $PRX here -$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL); -$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL); -$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; +$IN $CM* ($H3 | $H2 | $JT | $JV | $JL); +$PO $CM* ($H3 | $H2 | $JT | $JV | $JL); +($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; # LB 28 -$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL); +($ALPlus | $HL) $CM* ($ALPlus | $HL); # LB 29 -$CM* ($ALPlus | $HL) $CM* $IS; +($ALPlus | $HL) $CM* $IS; # LB 30 -$CM* $OP $CM* ($ALPlus | $HL | $NU); -$CM* ($ALPlus | $HL | $NU) $CM* $CP; +$OP $CM* ($ALPlus | $HL | $NU); +($ALPlus | $HL | $NU) $CM* $CP; # LB 30a -$CM* $RI $CM* $RI; +# Pairs of Regional Indicators. +# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs, +# the second with an even number. Stripping away the cruft they look like +# [^RI] RI / (RI RI)+ ^RI; +# [^RI] RI RI / (RI RI)+ ^RI; +# +[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; +[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; + +# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs. +$RI $CM* $RI; + +# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI". 
+$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL)); + + +# LB 30b Do not break between an Emoji Base and an Emoji Modifier +$EM $CM* $EB; + ## ------------------------------------------------- !!safe_reverse; # LB 9 -$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; -$CM+ $SP / .; +^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; +^$CM+ $SP / .; # LB 14 $SP+ $CM* $OP; @@ -706,6 +749,9 @@ $CM* ($HY | $BA | $BAX) $CM* $HL; ($CM* ($IS | $SY))+ $CM* $NU; ($CL | $CP) $CM* ($NU | $IS | $SY); +# LB 30 +($CM* $RI)+; + # For dictionary-based break $dictionary $dictionary; @@ -722,6 +768,6 @@ $dictionary $dictionary; # turn off rule chaining. We don't want to move more # than necessary. # -[$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $dictionary]; +^[$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $RI $ZWJ $dictionary]; $dictionary $dictionary; diff --git a/icu4c/source/data/brkitr/rules/line_loose_fi.txt b/icu4c/source/data/brkitr/rules/line_loose_fi.txt index c5dae9f85ee..db23cb8eff7 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_fi.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_fi.txt @@ -1,13 +1,17 @@ -# Copyright (c) 2002-2015 International Business Machines Corporation and +# Copyright (c) 2002-2016 International Business Machines Corporation and # others. All Rights Reserved. # # file: line_loose_fi.txt # # Line Breaking Rules -# Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 34 for Unicode 8.0 +# Implement default line breaking as defined by +# Unicode Standard Annex #14 Revision 35 for Unicode 8.0 # http://www.unicode.org/reports/tr14/ -# tailored as noted in 2nd paragraph below.. +# +# Includes the Emoji breaking proposals from Unicode L2/16-011R3. +# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf +# +# tailored as noted in 2nd paragraph below. 
# # TODO: Rule LB 8 remains as it was in Unicode 5.2 # This is only because of a limitation of ICU break engine implementation, @@ -24,8 +28,6 @@ # !!chain; -!!LBCMNoChain; - !!lookAheadHardBreak; # @@ -63,8 +65,13 @@ # See rule LB 19 for an example. # +# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available. + +$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; +$EM = [\U0001F3FB-\U0001F3FF]; + $AI = [:LineBreak = Ambiguous:]; -$AL = [:LineBreak = Alphabetic:]; +$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]]; $BA = [[:LineBreak = Break_After:] - [\u2010]]; $HH = [\u2010]; $BB = [:LineBreak = Break_Before:]; @@ -73,7 +80,7 @@ $B2 = [:LineBreak = Break_Both:]; $CB = [:LineBreak = Contingent_Break:]; $CJ = [:LineBreak = Conditional_Japanese_Starter:]; $CL = [:LineBreak = Close_Punctuation:]; -$CM = [:LineBreak = Combining_Mark:]; +$CM = [[:LineBreak = Combining_Mark:] \u200d]; $CP = [:LineBreak = Close_Parenthesis:]; $CR = [:LineBreak = Carriage_Return:]; $EX = [:LineBreak = Exclamation:]; @@ -82,7 +89,7 @@ $HL = [:LineBreak = Hebrew_Letter:]; $HY = [:LineBreak = Hyphen:]; $H2 = [:LineBreak = H2:]; $H3 = [:LineBreak = H3:]; -$ID = [[:LineBreak = Ideographic:] $CJ]; +$ID = [[:LineBreak = Ideographic:]$CJ[\u2764] - $EB]; $IN = [:LineBreak = Inseperable:]; $IS = [:LineBreak = Infix_Numeric:]; $JL = [:LineBreak = JL:]; @@ -105,6 +112,7 @@ $SY = [:LineBreak = Break_Symbols:]; $WJ = [:LineBreak = Word_Joiner:]; $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; +$ZWJ = [\u200d]; # Dictionary character set, for triggering language-based break engines. Currently # limited to LineBreak=Complex_Context. 
Note that this set only works in Unicode @@ -138,7 +146,6 @@ $HLcm = $HL $CM*; $HYcm = $HY $CM*; $H2cm = $H2 $CM*; $H3cm = $H3 $CM*; -$IDcm = $ID $CM*; $INcm = $IN $CM*; $IScm = $IS $CM*; $JLcm = $JL $CM*; @@ -169,6 +176,8 @@ $BB $CM+; $B2 $CM+; $CL $CM+; $CP $CM+; +$EB $CM+; +$EM $CM+; $EX $CM+; $GL $CM+; $HL $CM+; @@ -218,7 +227,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM]; # Rule LB 4, 5 Mandatory (Hard) breaks. # $LB4Breaks = [$BK $CR $LF $NL]; -$LB4NonBreaks = [^$BK $CR $LF $NL]; +$LB4NonBreaks = [^$BK $CR $LF $NL $CM]; $CR $LF {100}; # @@ -226,13 +235,13 @@ $CR $LF {100}; # $LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks. $CAN_CM $CM* $LB4Breaks {100}; -$CM+ $LB4Breaks {100}; +^$CM+ $LB4Breaks {100}; # LB 7 x SP # x ZW $LB4NonBreaks [$SP $ZW]; $CAN_CM $CM* [$SP $ZW]; -$CM+ [$SP $ZW]; +^$CM+ [$SP $ZW]; # # LB 8 Break after zero width space @@ -243,20 +252,23 @@ $CM+ [$SP $ZW]; $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; +# LB 8a ZWJ x ID Emoji proposal. +# +$ZWJ ($ID | $EB | $EM); -# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL -# $CM not covered by the above needs to behave like $AL +# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL +# $CM not covered by the above needs to behave like $AL # See definition of $CAN_CM. $CAN_CM $CM+; # Stick together any combining sequences that don't match other rules. -$CM+; +^$CM+; # # LB 11 Do not break before or after WORD JOINER & related characters. # $CAN_CM $CM* $WJcm; $LB8NonBreaks $WJcm; -$CM+ $WJcm; +^$CM+ $WJcm; $WJcm $CANT_CM; $WJcm $CAN_CM $CM*; @@ -267,13 +279,13 @@ $WJcm $CAN_CM $CM*; # $GLcm $CAN_CM $CM*; $GLcm $CANT_CM; - + # # LB 12a Do not break before NBSP and related characters ... 
# [^SP BA HY] x GL # [[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm; -$CM+ GLcm; +^$CM+ $GLcm; @@ -282,23 +294,23 @@ $CM+ GLcm; # $LB8NonBreaks $CL; $CAN_CM $CM* $CL; -$CM+ $CL; # by rule 10, stand-alone CM behaves as AL +^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $CP; $CAN_CM $CM* $CP; -$CM+ $CP; # by rule 10, stand-alone CM behaves as AL +^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $EX; $CAN_CM $CM* $EX; -$CM+ $EX; # by rule 10, stand-alone CM behaves as AL +^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $IS; $CAN_CM $CM* $IS; -$CM+ $IS; # by rule 10, stand-alone CM behaves as AL +^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $SY; $CAN_CM $CM* $SY; -$CM+ $SY; # by rule 10, stand-alone CM behaves as AL +^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL # @@ -330,13 +342,10 @@ $LB18Breaks = [$LB8Breaks $SP]; # LB 19 # x QU $LB18NonBreaks $CM* $QUcm; -$CM+ $QUcm; +^$CM+ $QUcm; # QU x $QUcm .?; -$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc. - # TODO: I don't think this rule is needed. 
- # LB 20 # $CB @@ -352,13 +361,14 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; $LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm) / $AL; $LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm); ($HY | $HH) $AL; +^$CM+ ($BAcm | $HHcm | $HYcm | $NScm); $BBcm [^$CB]; # $BB x $BBcm $LB20NonBreaks $CM*; # LB 21a Don't break after Hebrew + Hyphen # HL (HY | BA) x -# +# $HLcm ($HYcm | $BAcm | $HHcm) [^$CB]?; # LB 21b (forward) Don't break between SY and HL @@ -367,25 +377,25 @@ $SYcm $HLcm; # LB 22 ($ALcm | $HLcm) $INcm; -$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL +^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL $EXcm $INcm; -$IDcm $INcm; +($ID | $EB | $EM) $CM* $INcm; $INcm $INcm; $NUcm $INcm; # $LB 23 -$IDcm $POcm; +($ID | $EB | $EM) $CM* $POcm; $ALcm $NUcm; # includes $LB19 $HLcm $NUcm; -$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL +^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL $NUcm $ALcm; $NUcm $HLcm; # # LB 24 # -$PRcm $IDcm; +$PRcm ($ID | $EB | $EM); $PRcm ($ALcm | $HLcm); $POcm ($ALcm | $HLcm); @@ -409,18 +419,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm); # LB 28 Do not break between alphabetics # ($ALcm | $HLcm) ($ALcm | $HLcm); -$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL +^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL # LB 29 $IScm ($ALcm | $HLcm); # LB 30 ($ALcm | $HLcm | $NUcm) $OPcm; -$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL. +^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL. $CPcm ($ALcm | $HLcm | $NUcm); -# LB 30a Do not break between regional indicators. -$RIcm $RIcm; +# LB 30a Do not break between regional indicators. Break after pairs of them. 
+# Tricky interaction with LB8a: ZWJ x ID +$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX $CM] {eof}]; +$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX $CM $ID $EB $EM] {eof}]; +$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX $CM] {eof}]; + +$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX {eof}]; +$RIcm $RIcm $ZWJ ($ID | $EB | $EM); + +# LB 30b Do not break between an Emoji Base and an Emoji Modifier +$EB $CM* $EM; # # Reverse Rules. @@ -429,36 +448,38 @@ $RIcm $RIcm; !!reverse; -$CM+ $ALPlus; -$CM+ $BA; -$CM+ $HH; -$CM+ $BB; -$CM+ $B2; -$CM+ $CL; -$CM+ $CP; -$CM+ $EX; -$CM+ $GL; -$CM+ $HL; -$CM+ $HY; -$CM+ $H2; -$CM+ $H3; -$CM+ $ID; -$CM+ $IN; -$CM+ $IS; -$CM+ $JL; -$CM+ $JV; -$CM+ $JT; -$CM+ $NS; -$CM+ $NSX; -$CM+ $NU; -$CM+ $OP; -$CM+ $PO; -$CM+ $PR; -$CM+ $QU; -$CM+ $RI; -$CM+ $SY; -$CM+ $WJ; -$CM+; +^$CM+ $ALPlus; +^$CM+ $BA; +^$CM+ $BB; +^$CM+ $B2; +^$CM+ $CL; +^$CM+ $CP; +^$CM+ $EB; +^$CM+ $EM; +^$CM+ $EX; +^$CM+ $GL; +^$CM+ $HH; +^$CM+ $HL; +^$CM+ $HY; +^$CM+ $H2; +^$CM+ $H3; +^$CM+ $ID; +^$CM+ $IN; +^$CM+ $IS; +^$CM+ $JL; +^$CM+ $JV; +^$CM+ $JT; +^$CM+ $NS; +^$CM+ $NSX; +^$CM+ $NU; +^$CM+ $OP; +^$CM+ $PO; +^$CM+ $PR; +^$CM+ $QU; +^$CM+ $RI; +^$CM+ $SY; +^$CM+ $WJ; +^$CM+; # @@ -470,14 +491,14 @@ $AL_FOLLOW $CM+ / ( [$BK $CR $LF $NL $ZW {eof}] | $SP+ $CM+ $SP | $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break. - # LB14 says OP SP* x . + # LB14 says OP SP* x . # becomes OP SP* x AL # becomes OP SP* x CM+ AL_FOLLOW # # Further note: the $AL in [$AL {eof}] is only to work around # a rule compiler bug which complains about # empty sets otherwise. 
- + # # Sequences of the form (shown forwards) # [CANT_CM] [CM] [PR] @@ -489,7 +510,7 @@ $AL_FOLLOW $CM+ / ( -# LB 4, 5, 5 +# LB 4, 5, 6 $LB4Breaks [$LB4NonBreaks-$CM]; $LB4Breaks $CM+ $CAN_CM; @@ -506,30 +527,37 @@ $LF $CR; # Requires an engine enhancement. # / $SP* $ZW +# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3 +# The ZWJ will look like a CM to whatever precedes it. +# +($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?; + + # LB 9,10 Combining marks. # X $CM needs to behave like X, where X is not $SP or controls. # $CM not covered by the above needs to behave like $AL # Stick together any combining sequences that don't match other rules. -$CM+ $CAN_CM; +^$CM+ $CAN_CM; # LB 11 -$CM* $WJ $CM* $CAN_CM; -$CM* $WJ [$LB8NonBreaks-$CM]; +# +$WJ $CM* $CAN_CM; +$WJ [$LB8NonBreaks-$CM]; $CANT_CM $CM* $WJ; -$CM* $CAN_CM $CM* $WJ; +$CAN_CM $CM* $WJ; # LB 12a # [^SP BA HY] x GL # -$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]]; +$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]]; # LB 12 # GL x # $CANT_CM $CM* $GL; -$CM* $CAN_CM $CM* $GL; +$CAN_CM $CM* $GL; # LB 13 @@ -550,29 +578,27 @@ $SY [$LB8NonBreaks-$CM]; # OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | IY) # This really wants to chain at the $CM+ (which is acting as an $AL) # except for $CM chaining being disabled. -[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP; +[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP; # LB 14 OP SP* x # -$CM* $CAN_CM $SP* $CM* $OP; +$CAN_CM $SP* $CM* $OP; $CANT_CM $SP* $CM* $OP; $AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP - - $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; -$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; -$SY $CM $SP+ $OP; # TODO: Experiment. Remove. 
+ $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; +$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; # LB 15 -$CM* $OP $SP* $CM* $QU; +$OP $SP* $CM* $QU; # LB 16 # Don't include $NSX here -$CM* $NS $SP* $CM* ($CL | $CP); +$NS $SP* $CM* ($CL | $CP); # LB 17 -$CM* $B2 $SP* $CM* $B2; +$B2 $SP* $CM* $B2; # LB 18 break after spaces # Nothing explicit needed here. @@ -581,13 +607,13 @@ $CM* $B2 $SP* $CM* $B2; # # LB 19 # -$CM* $QU $CM* $CAN_CM; # . x QU -$CM* $QU $LB18NonBreaks; +$QU $CM* $CAN_CM; # . x QU +$QU $LB18NonBreaks; -$CM* $CAN_CM $CM* $QU; # QU x . +$CAN_CM $CM* $QU; # QU x . $CANT_CM $CM* $QU; - + # # LB 20 Break before and after CB. # nothing needed here. @@ -598,69 +624,87 @@ $AL ($HY | $HH) / $SP; # LB 21 # Don't include $NSX here -$CM* ($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) +($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) -$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . -[^$CB] $CM* $BB; # +[$LB20NonBreaks-$CM] $CM* $BB; # BB x . +[^$CB] $CM* $BB; # # LB21a [^$CB] $CM* ($HY | $BA | $HH) $CM* $HL; # LB21b (reverse) -$CM* $HL $CM* $SY; +$HL $CM* $SY; # LB 22 -$CM* $IN $CM* ($ALPlus | $HL); -$CM* $IN $CM* $EX; -$CM* $IN $CM* $ID; -$CM* $IN $CM* $IN; -$CM* $IN $CM* $NU; +$IN $CM* ($ALPlus | $HL); +$IN $CM* $EX; +$IN $CM* ($ID | $EB | $EM); +$IN $CM* $IN; +$IN $CM* $NU; # LB 23 -$CM* $PO $CM* $ID; -$CM* $NU $CM* ($ALPlus | $HL); -$CM* ($ALPlus | $HL) $CM* $NU; +$PO $CM* ($ID | $EB | $EM); +$NU $CM* ($ALPlus | $HL); +($ALPlus | $HL) $CM* $NU; # LB 24 -$CM* $ID $CM* $PR; -$CM* ($ALPlus | $HL) $CM* $PR; -$CM* ($ALPlus | $HL) $CM* $PO; +($ID | $EB | $EM) $CM* $PR; +($ALPlus | $HL) $CM* $PR; +($ALPlus | $HL) $CM* $PO; # LB 25 ($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? 
($CM* ($PR | $PO))?; # LB 26 -$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL; -$CM* ($JT | $JV) $CM* ($H2 | $JV); -$CM* $JT $CM* ($H3 | $JT); +($H3 | $H2 | $JV | $JL) $CM* $JL; +($JT | $JV) $CM* ($H2 | $JV); +$JT $CM* ($H3 | $JT); # LB 27 -$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL); -$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL); -$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; +$IN $CM* ($H3 | $H2 | $JT | $JV | $JL); +$PO $CM* ($H3 | $H2 | $JT | $JV | $JL); + ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; # LB 28 -$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL); +($ALPlus | $HL) $CM* ($ALPlus | $HL); # LB 29 -$CM* ($ALPlus | $HL) $CM* $IS; +($ALPlus | $HL) $CM* $IS; # LB 30 -$CM* $OP $CM* ($ALPlus | $HL | $NU); -$CM* ($ALPlus | $HL | $NU) $CM* $CP; +$OP $CM* ($ALPlus | $HL | $NU); +($ALPlus | $HL | $NU) $CM* $CP; # LB 30a -$CM* $RI $CM* $RI; +# Pairs of Regional Indicators. +# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs, +# the second with an even number. Stripping away the cruft they look like +# [^RI] RI / (RI RI)+ ^RI; +# [^RI] RI RI / (RI RI)+ ^RI; +# +[{bof} $NS $NSX $HY $BA $HH $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; +[{bof} $NS $NSX $HY $BA $HH $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; + +# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs. +$RI $CM* $RI; + +# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI". 
+$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL)); + + +# LB 30b Do not break between an Emoji Base and an Emoji Modifier +$EM $CM* $EB; + ## ------------------------------------------------- !!safe_reverse; # LB 9 -$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; -$CM+ $SP / .; +^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; +^$CM+ $SP / .; # LB 14 $SP+ $CM* $OP; @@ -675,12 +719,15 @@ $SP+ $CM* ($CL | $CP); $SP+ $CM* $B2; # LB 21 -$CM* ($HY | $BA | $HH) $CM* $HL; +($HY | $BA | $HH) $CM* $HL; # LB 25 ($CM* ($IS | $SY))+ $CM* $NU; ($CL | $CP) $CM* ($NU | $IS | $SY); +# LB 30 +($CM* $RI)+; + # For dictionary-based break $dictionary $dictionary; @@ -697,6 +744,6 @@ $dictionary $dictionary; # turn off rule chaining. We don't want to move more # than necessary. # -[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary]; +^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $HH $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $HH $RI $ZWJ $dictionary]; $dictionary $dictionary; diff --git a/icu4c/source/data/brkitr/rules/line_normal.txt b/icu4c/source/data/brkitr/rules/line_normal.txt index b03d01fc151..c19a77aa3cb 100644 --- a/icu4c/source/data/brkitr/rules/line_normal.txt +++ b/icu4c/source/data/brkitr/rules/line_normal.txt @@ -1,13 +1,17 @@ -# Copyright (c) 2002-2015 International Business Machines Corporation and +# Copyright (c) 2002-2016 International Business Machines Corporation and # others. All Rights Reserved. # # file: line_normal.txt # # Line Breaking Rules -# Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 34 for Unicode 8.0 +# Implement default line breaking as defined by +# Unicode Standard Annex #14 Revision 35 for Unicode 8.0 # http://www.unicode.org/reports/tr14/ -# tailored as noted in 2nd paragraph below.. +# +# Includes the Emoji breaking proposals from Unicode L2/16-011R3. 
+# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf +# +# tailored as noted in 2nd paragraph below. # # TODO: Rule LB 8 remains as it was in Unicode 5.2 # This is only because of a limitation of ICU break engine implementation, @@ -23,8 +27,6 @@ # !!chain; -!!LBCMNoChain; - !!lookAheadHardBreak; # @@ -62,8 +64,13 @@ # See rule LB 19 for an example. # +# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available. + +$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; +$EM = [\U0001F3FB-\U0001F3FF]; + $AI = [:LineBreak = Ambiguous:]; -$AL = [:LineBreak = Alphabetic:]; +$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]]; $BA = [:LineBreak = Break_After:]; $BB = [:LineBreak = Break_Before:]; $BK = [:LineBreak = Mandatory_Break:]; @@ -71,7 +78,7 @@ $B2 = [:LineBreak = Break_Both:]; $CB = [:LineBreak = Contingent_Break:]; $CJ = [:LineBreak = Conditional_Japanese_Starter:]; $CL = [:LineBreak = Close_Punctuation:]; -$CM = [:LineBreak = Combining_Mark:]; +$CM = [[:LineBreak = Combining_Mark:] \u200d]; $CP = [:LineBreak = Close_Parenthesis:]; $CR = [:LineBreak = Carriage_Return:]; $EX = [:LineBreak = Exclamation:]; @@ -80,7 +87,7 @@ $HL = [:LineBreak = Hebrew_Letter:]; $HY = [:LineBreak = Hyphen:]; $H2 = [:LineBreak = H2:]; $H3 = [:LineBreak = H3:]; -$ID = [[:LineBreak = Ideographic:] $CJ]; +$ID = [[:LineBreak = Ideographic:] $CJ [\u2764] - $EB]; $IN = [:LineBreak = Inseperable:]; $IS = [:LineBreak = Infix_Numeric:]; $JL = [:LineBreak = JL:]; @@ -102,6 +109,7 @@ $SY = [:LineBreak = Break_Symbols:]; $WJ = [:LineBreak = Word_Joiner:]; $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; +$ZWJ = [\u200d]; # 
Dictionary character set, for triggering language-based break engines. Currently # limited to LineBreak=Complex_Context. Note that this set only works in Unicode @@ -134,7 +142,6 @@ $HLcm = $HL $CM*; $HYcm = $HY $CM*; $H2cm = $H2 $CM*; $H3cm = $H3 $CM*; -$IDcm = $ID $CM*; $INcm = $IN $CM*; $IScm = $IS $CM*; $JLcm = $JL $CM*; @@ -163,6 +170,8 @@ $BB $CM+; $B2 $CM+; $CL $CM+; $CP $CM+; +$EB $CM+; +$EM $CM+; $EX $CM+; $GL $CM+; $HL $CM+; @@ -211,7 +220,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM]; # Rule LB 4, 5 Mandatory (Hard) breaks. # $LB4Breaks = [$BK $CR $LF $NL]; -$LB4NonBreaks = [^$BK $CR $LF $NL]; +$LB4NonBreaks = [^$BK $CR $LF $NL $CM]; $CR $LF {100}; # @@ -219,13 +228,13 @@ $CR $LF {100}; # $LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks. $CAN_CM $CM* $LB4Breaks {100}; -$CM+ $LB4Breaks {100}; +^$CM+ $LB4Breaks {100}; # LB 7 x SP # x ZW $LB4NonBreaks [$SP $ZW]; $CAN_CM $CM* [$SP $ZW]; -$CM+ [$SP $ZW]; +^$CM+ [$SP $ZW]; # # LB 8 Break after zero width space @@ -236,20 +245,23 @@ $CM+ [$SP $ZW]; $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; +# LB 8a ZWJ x ID Emoji proposal. +# +$ZWJ ($ID | $EB | $EM); -# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL -# $CM not covered by the above needs to behave like $AL +# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL +# $CM not covered by the above needs to behave like $AL # See definition of $CAN_CM. $CAN_CM $CM+; # Stick together any combining sequences that don't match other rules. -$CM+; +^$CM+; # # LB 11 Do not break before or after WORD JOINER & related characters. # $CAN_CM $CM* $WJcm; $LB8NonBreaks $WJcm; -$CM+ $WJcm; +^$CM+ $WJcm; $WJcm $CANT_CM; $WJcm $CAN_CM $CM*; @@ -260,13 +272,13 @@ $WJcm $CAN_CM $CM*; # $GLcm $CAN_CM $CM*; $GLcm $CANT_CM; - + # # LB 12a Do not break before NBSP and related characters ... 
# [^SP BA HY] x GL # [[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm; -$CM+ GLcm; +^$CM+ $GLcm; @@ -275,23 +287,23 @@ $CM+ GLcm; # $LB8NonBreaks $CL; $CAN_CM $CM* $CL; -$CM+ $CL; # by rule 10, stand-alone CM behaves as AL +^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $CP; $CAN_CM $CM* $CP; -$CM+ $CP; # by rule 10, stand-alone CM behaves as AL +^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $EX; $CAN_CM $CM* $EX; -$CM+ $EX; # by rule 10, stand-alone CM behaves as AL +^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $IS; $CAN_CM $CM* $IS; -$CM+ $IS; # by rule 10, stand-alone CM behaves as AL +^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $SY; $CAN_CM $CM* $SY; -$CM+ $SY; # by rule 10, stand-alone CM behaves as AL +^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL # @@ -321,12 +333,10 @@ $LB18Breaks = [$LB8Breaks $SP]; # LB 19 # x QU $LB18NonBreaks $CM* $QUcm; -$CM+ $QUcm; +^$CM+ $QUcm; # QU x $QUcm .?; -$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc. - # TODO: I don't think this rule is needed. 
# LB 20 @@ -338,14 +348,15 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # LB 21 x (BA | HY | NS) # BB x # -$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); +$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); +^$CM+ ($BAcm | $HYcm | $NScm); $BBcm [^$CB]; # $BB x $BBcm $LB20NonBreaks $CM*; # LB 21a Don't break after Hebrew + Hyphen # HL (HY | BA) x -# +# $HLcm ($HYcm | $BAcm) [^$CB]?; # LB 21b (forward) Don't break between SY and HL @@ -354,25 +365,25 @@ $SYcm $HLcm; # LB 22 ($ALcm | $HLcm) $INcm; -$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL +^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL $EXcm $INcm; -$IDcm $INcm; +($ID | $EB | $EM) $CM* $INcm; $INcm $INcm; $NUcm $INcm; # $LB 23 -$IDcm $POcm; +($ID | $EB | $EM) $CM* $POcm; $ALcm $NUcm; # includes $LB19 $HLcm $NUcm; -$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL +^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL $NUcm $ALcm; $NUcm $HLcm; # # LB 24 # -$PRcm $IDcm; +$PRcm ($ID | $EB | $EM); $PRcm ($ALcm | $HLcm); $POcm ($ALcm | $HLcm); @@ -396,18 +407,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm); # LB 28 Do not break between alphabetics # ($ALcm | $HLcm) ($ALcm | $HLcm); -$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL +^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL # LB 29 $IScm ($ALcm | $HLcm); # LB 30 ($ALcm | $HLcm | $NUcm) $OPcm; -$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL. +^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL. $CPcm ($ALcm | $HLcm | $NUcm); -# LB 30a Do not break between regional indicators. -$RIcm $RIcm; +# LB 30a Do not break between regional indicators. Break after pairs of them. 
+# Tricky interaction with LB8a: ZWJ x ID +$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; +$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}]; +$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; + +$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}]; +$RIcm $RIcm $ZWJ ($ID | $EB | $EM); + +# LB 30b Do not break between an Emoji Base and an Emoji Modifier +$EB $CM* $EM; # # Reverse Rules. @@ -416,34 +436,36 @@ $RIcm $RIcm; !!reverse; -$CM+ $ALPlus; -$CM+ $BA; -$CM+ $BB; -$CM+ $B2; -$CM+ $CL; -$CM+ $CP; -$CM+ $EX; -$CM+ $GL; -$CM+ $HL; -$CM+ $HY; -$CM+ $H2; -$CM+ $H3; -$CM+ $ID; -$CM+ $IN; -$CM+ $IS; -$CM+ $JL; -$CM+ $JV; -$CM+ $JT; -$CM+ $NS; -$CM+ $NU; -$CM+ $OP; -$CM+ $PO; -$CM+ $PR; -$CM+ $QU; -$CM+ $RI; -$CM+ $SY; -$CM+ $WJ; -$CM+; +^$CM+ $ALPlus; +^$CM+ $BA; +^$CM+ $BB; +^$CM+ $B2; +^$CM+ $CL; +^$CM+ $CP; +^$CM+ $EB; +^$CM+ $EM; +^$CM+ $EX; +^$CM+ $GL; +^$CM+ $HL; +^$CM+ $HY; +^$CM+ $H2; +^$CM+ $H3; +^$CM+ $ID; +^$CM+ $IN; +^$CM+ $IS; +^$CM+ $JL; +^$CM+ $JV; +^$CM+ $JT; +^$CM+ $NS; +^$CM+ $NU; +^$CM+ $OP; +^$CM+ $PO; +^$CM+ $PR; +^$CM+ $QU; +^$CM+ $RI; +^$CM+ $SY; +^$CM+ $WJ; +^$CM+; # @@ -455,14 +477,14 @@ $AL_FOLLOW $CM+ / ( [$BK $CR $LF $NL $ZW {eof}] | $SP+ $CM+ $SP | $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break. - # LB14 says OP SP* x . + # LB14 says OP SP* x . # becomes OP SP* x AL # becomes OP SP* x CM+ AL_FOLLOW # # Further note: the $AL in [$AL {eof}] is only to work around # a rule compiler bug which complains about # empty sets otherwise. - + # # Sequences of the form (shown forwards) # [CANT_CM] [CM] [PR] @@ -474,7 +496,7 @@ $AL_FOLLOW $CM+ / ( -# LB 4, 5, 5 +# LB 4, 5, 6 $LB4Breaks [$LB4NonBreaks-$CM]; $LB4Breaks $CM+ $CAN_CM; @@ -491,30 +513,37 @@ $LF $CR; # Requires an engine enhancement. 
# / $SP* $ZW +# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3 +# The ZWJ will look like a CM to whatever precedes it. +# +($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?; + + # LB 9,10 Combining marks. # X $CM needs to behave like X, where X is not $SP or controls. # $CM not covered by the above needs to behave like $AL # Stick together any combining sequences that don't match other rules. -$CM+ $CAN_CM; +^$CM+ $CAN_CM; # LB 11 -$CM* $WJ $CM* $CAN_CM; -$CM* $WJ [$LB8NonBreaks-$CM]; +# +$WJ $CM* $CAN_CM; +$WJ [$LB8NonBreaks-$CM]; $CANT_CM $CM* $WJ; -$CM* $CAN_CM $CM* $WJ; +$CAN_CM $CM* $WJ; # LB 12a # [^SP BA HY] x GL # -$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]]; +$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]]; # LB 12 # GL x # $CANT_CM $CM* $GL; -$CM* $CAN_CM $CM* $GL; +$CAN_CM $CM* $GL; # LB 13 @@ -535,28 +564,26 @@ $SY [$LB8NonBreaks-$CM]; # OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | IY) # This really wants to chain at the $CM+ (which is acting as an $AL) # except for $CM chaining being disabled. -[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP; +[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP; # LB 14 OP SP* x # -$CM* $CAN_CM $SP* $CM* $OP; +$CAN_CM $SP* $CM* $OP; $CANT_CM $SP* $CM* $OP; $AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP - - $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; -$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; -$SY $CM $SP+ $OP; # TODO: Experiment. Remove. + $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; +$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; # LB 15 -$CM* $OP $SP* $CM* $QU; +$OP $SP* $CM* $QU; # LB 16 -$CM* $NS $SP* $CM* ($CL | $CP); +$NS $SP* $CM* ($CL | $CP); # LB 17 -$CM* $B2 $SP* $CM* $B2; +$B2 $SP* $CM* $B2; # LB 18 break after spaces # Nothing explicit needed here. @@ -565,82 +592,100 @@ $CM* $B2 $SP* $CM* $B2; # # LB 19 # -$CM* $QU $CM* $CAN_CM; # . x QU -$CM* $QU $LB18NonBreaks; +$QU $CM* $CAN_CM; # . x QU +$QU $LB18NonBreaks; -$CM* $CAN_CM $CM* $QU; # QU x . +$CAN_CM $CM* $QU; # QU x . 
$CANT_CM $CM* $QU; - + # # LB 20 Break before and after CB. # nothing needed here. # # LB 21 -$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) +($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) -$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . -[^$CB] $CM* $BB; # +[$LB20NonBreaks-$CM] $CM* $BB; # BB x . +[^$CB] $CM* $BB; # # LB21a [^$CB] $CM* ($HY | $BA) $CM* $HL; # LB21b (reverse) -$CM* $HL $CM* $SY; +$HL $CM* $SY; # LB 22 -$CM* $IN $CM* ($ALPlus | $HL); -$CM* $IN $CM* $EX; -$CM* $IN $CM* $ID; -$CM* $IN $CM* $IN; -$CM* $IN $CM* $NU; +$IN $CM* ($ALPlus | $HL); +$IN $CM* $EX; +$IN $CM* ($ID | $EB | $EM); +$IN $CM* $IN; +$IN $CM* $NU; # LB 23 -$CM* $PO $CM* $ID; -$CM* $NU $CM* ($ALPlus | $HL); -$CM* ($ALPlus | $HL) $CM* $NU; +$PO $CM* ($ID | $EB | $EM); +$NU $CM* ($ALPlus | $HL); +($ALPlus | $HL) $CM* $NU; # LB 24 -$CM* $ID $CM* $PR; -$CM* ($ALPlus | $HL) $CM* $PR; -$CM* ($ALPlus | $HL) $CM* $PO; +($ID | $EB | $EM) $CM* $PR; +($ALPlus | $HL) $CM* $PR; +($ALPlus | $HL) $CM* $PO; # LB 25 ($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?; # LB 26 -$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL; -$CM* ($JT | $JV) $CM* ($H2 | $JV); -$CM* $JT $CM* ($H3 | $JT); +($H3 | $H2 | $JV | $JL) $CM* $JL; +($JT | $JV) $CM* ($H2 | $JV); +$JT $CM* ($H3 | $JT); # LB 27 -$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL); -$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL); -$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; +$IN $CM* ($H3 | $H2 | $JT | $JV | $JL); +$PO $CM* ($H3 | $H2 | $JT | $JV | $JL); + ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; # LB 28 -$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL); +($ALPlus | $HL) $CM* ($ALPlus | $HL); # LB 29 -$CM* ($ALPlus | $HL) $CM* $IS; +($ALPlus | $HL) $CM* $IS; # LB 30 -$CM* $OP $CM* ($ALPlus | $HL | $NU); -$CM* ($ALPlus | $HL | $NU) $CM* $CP; +$OP $CM* ($ALPlus | $HL | $NU); +($ALPlus | $HL | $NU) $CM* $CP; # LB 30a -$CM* $RI $CM* $RI; +# Pairs of Regional Indicators. 
+# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs, +# the second with an even number. Stripping away the cruft they look like +# [^RI] RI / (RI RI)+ ^RI; +# [^RI] RI RI / (RI RI)+ ^RI; +# +[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; +[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; + +# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs. +$RI $CM* $RI; + +# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI". +$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL)); + + +# LB 30b Do not break between an Emoji Base and an Emoji Modifier +$EM $CM* $EB; + ## ------------------------------------------------- !!safe_reverse; # LB 9 -$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; -$CM+ $SP / .; +^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; +^$CM+ $SP / .; # LB 14 $SP+ $CM* $OP; @@ -661,6 +706,9 @@ $CM* ($HY | $BA) $CM* $HL; ($CM* ($IS | $SY))+ $CM* $NU; ($CL | $CP) $CM* ($NU | $IS | $SY); +# LB 30 +($CM* $RI)+; + # For dictionary-based break $dictionary $dictionary; @@ -677,6 +725,6 @@ $dictionary $dictionary; # turn off rule chaining. We don't want to move more # than necessary. 
# -[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary]; +^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary]; $dictionary $dictionary; diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt index 908a41017fc..890728d776d 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt @@ -1,13 +1,17 @@ -# Copyright (c) 2002-2015 International Business Machines Corporation and +# Copyright (c) 2002-2016 International Business Machines Corporation and # others. All Rights Reserved. # # file: line_normal_cj.txt # # Line Breaking Rules -# Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 34 for Unicode 8.0 +# Implement default line breaking as defined by +# Unicode Standard Annex #14 Revision 35 for Unicode 8.0 # http://www.unicode.org/reports/tr14/ -# tailored as noted in 2nd paragraph below.. +# +# Includes the Emoji breaking proposals from Unicode L2/16-011R3. +# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf +# +# tailored as noted in 2nd paragraph below. # # TODO: Rule LB 8 remains as it was in Unicode 5.2 # This is only because of a limitation of ICU break engine implementation, @@ -24,8 +28,6 @@ # !!chain; -!!LBCMNoChain; - !!lookAheadHardBreak; # @@ -63,8 +65,13 @@ # See rule LB 19 for an example. # +# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available. 
+ +$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; +$EM = [\U0001F3FB-\U0001F3FF]; + $AI = [:LineBreak = Ambiguous:]; -$AL = [:LineBreak = Alphabetic:]; +$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]]; $BAX = [\u2010 \u2013]; $BA = [[:LineBreak = Break_After:] - $BAX]; $BB = [:LineBreak = Break_Before:]; @@ -73,7 +80,7 @@ $B2 = [:LineBreak = Break_Both:]; $CB = [:LineBreak = Contingent_Break:]; $CJ = [:LineBreak = Conditional_Japanese_Starter:]; $CL = [:LineBreak = Close_Punctuation:]; -$CM = [:LineBreak = Combining_Mark:]; +$CM = [[:LineBreak = Combining_Mark:] \u200d]; $CP = [:LineBreak = Close_Parenthesis:]; $CR = [:LineBreak = Carriage_Return:]; $EX = [:LineBreak = Exclamation:]; @@ -82,7 +89,7 @@ $HL = [:LineBreak = Hebrew_Letter:]; $HY = [:LineBreak = Hyphen:]; $H2 = [:LineBreak = H2:]; $H3 = [:LineBreak = H3:]; -$ID = [[:LineBreak = Ideographic:] $CJ]; +$ID = [[:LineBreak = Ideographic:] $CJ [\u2764] - $EB]; $IN = [:LineBreak = Inseperable:]; $IS = [:LineBreak = Infix_Numeric:]; $JL = [:LineBreak = JL:]; @@ -105,6 +112,7 @@ $SY = [:LineBreak = Break_Symbols:]; $WJ = [:LineBreak = Word_Joiner:]; $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; +$ZWJ = [\u200d]; # Dictionary character set, for triggering language-based break engines. Currently # limited to LineBreak=Complex_Context. 
Note that this set only works in Unicode @@ -138,7 +146,6 @@ $HLcm = $HL $CM*; $HYcm = $HY $CM*; $H2cm = $H2 $CM*; $H3cm = $H3 $CM*; -$IDcm = $ID $CM*; $INcm = $IN $CM*; $IScm = $IS $CM*; $JLcm = $JL $CM*; @@ -169,6 +176,8 @@ $BB $CM+; $B2 $CM+; $CL $CM+; $CP $CM+; +$EB $CM+; +$EM $CM+; $EX $CM+; $GL $CM+; $HL $CM+; @@ -218,7 +227,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM]; # Rule LB 4, 5 Mandatory (Hard) breaks. # $LB4Breaks = [$BK $CR $LF $NL]; -$LB4NonBreaks = [^$BK $CR $LF $NL]; +$LB4NonBreaks = [^$BK $CR $LF $NL $CM]; $CR $LF {100}; # @@ -226,13 +235,13 @@ $CR $LF {100}; # $LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks. $CAN_CM $CM* $LB4Breaks {100}; -$CM+ $LB4Breaks {100}; +^$CM+ $LB4Breaks {100}; # LB 7 x SP # x ZW $LB4NonBreaks [$SP $ZW]; $CAN_CM $CM* [$SP $ZW]; -$CM+ [$SP $ZW]; +^$CM+ [$SP $ZW]; # # LB 8 Break after zero width space @@ -243,20 +252,23 @@ $CM+ [$SP $ZW]; $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; +# LB 8a ZWJ x ID Emoji proposal. +# +$ZWJ ($ID | $EB | $EM); -# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL -# $CM not covered by the above needs to behave like $AL +# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL +# $CM not covered by the above needs to behave like $AL # See definition of $CAN_CM. $CAN_CM $CM+; # Stick together any combining sequences that don't match other rules. -$CM+; +^$CM+; # # LB 11 Do not break before or after WORD JOINER & related characters. # $CAN_CM $CM* $WJcm; $LB8NonBreaks $WJcm; -$CM+ $WJcm; +^$CM+ $WJcm; $WJcm $CANT_CM; $WJcm $CAN_CM $CM*; @@ -267,13 +279,13 @@ $WJcm $CAN_CM $CM*; # $GLcm $CAN_CM $CM*; $GLcm $CANT_CM; - + # # LB 12a Do not break before NBSP and related characters ... 
# [^SP BA HY] x GL # [[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GLcm; -$CM+ GLcm; +^$CM+ $GLcm; @@ -282,23 +294,23 @@ $CM+ GLcm; # $LB8NonBreaks $CL; $CAN_CM $CM* $CL; -$CM+ $CL; # by rule 10, stand-alone CM behaves as AL +^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $CP; $CAN_CM $CM* $CP; -$CM+ $CP; # by rule 10, stand-alone CM behaves as AL +^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $EX; $CAN_CM $CM* $EX; -$CM+ $EX; # by rule 10, stand-alone CM behaves as AL +^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $IS; $CAN_CM $CM* $IS; -$CM+ $IS; # by rule 10, stand-alone CM behaves as AL +^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $SY; $CAN_CM $CM* $SY; -$CM+ $SY; # by rule 10, stand-alone CM behaves as AL +^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL # @@ -330,12 +342,10 @@ $LB18Breaks = [$LB8Breaks $SP]; # LB 19 # x QU $LB18NonBreaks $CM* $QUcm; -$CM+ $QUcm; +^$CM+ $QUcm; # QU x $QUcm .?; -$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc. - # TODO: I don't think this rule is needed. 
# LB 20 @@ -348,14 +358,15 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # BB x # # DO allow breaks here before $BAXcm and $NSXcm, so don't include them -$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); +$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); +^$CM+ ($BAcm | $HYcm | $NScm); $BBcm [^$CB]; # $BB x $BBcm $LB20NonBreaks $CM*; # LB 21a Don't break after Hebrew + Hyphen # HL (HY | BA) x -# +# $HLcm ($HYcm | $BAcm | $BAXcm) [^$CB]?; # LB 21b (forward) Don't break between SY and HL @@ -364,25 +375,25 @@ $SYcm $HLcm; # LB 22 ($ALcm | $HLcm) $INcm; -$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL +^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL $EXcm $INcm; -$IDcm $INcm; +($ID | $EB | $EM) $CM* $INcm; $INcm $INcm; $NUcm $INcm; # $LB 23 -$IDcm $POcm; +($ID | $EB | $EM) $CM* $POcm; $ALcm $NUcm; # includes $LB19 $HLcm $NUcm; -$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL +^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL $NUcm $ALcm; $NUcm $HLcm; # # LB 24 # -$PRcm $IDcm; +$PRcm ($ID | $EB | $EM); $PRcm ($ALcm | $HLcm); $POcm ($ALcm | $HLcm); @@ -406,18 +417,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm); # LB 28 Do not break between alphabetics # ($ALcm | $HLcm) ($ALcm | $HLcm); -$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL +^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL # LB 29 $IScm ($ALcm | $HLcm); # LB 30 ($ALcm | $HLcm | $NUcm) $OPcm; -$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL. +^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL. $CPcm ($ALcm | $HLcm | $NUcm); -# LB 30a Do not break between regional indicators. -$RIcm $RIcm; +# LB 30a Do not break between regional indicators. Break after pairs of them. 
+# Tricky interaction with LB8a: ZWJ x ID +$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; +$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}]; +$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; + +$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}]; +$RIcm $RIcm $ZWJ ($ID | $EB | $EM); + +# LB 30b Do not break between an Emoji Base and an Emoji Modifier +$EB $CM* $EM; # # Reverse Rules. @@ -426,36 +446,38 @@ $RIcm $RIcm; !!reverse; -$CM+ $ALPlus; -$CM+ $BA; -$CM+ $BAX; -$CM+ $BB; -$CM+ $B2; -$CM+ $CL; -$CM+ $CP; -$CM+ $EX; -$CM+ $GL; -$CM+ $HL; -$CM+ $HY; -$CM+ $H2; -$CM+ $H3; -$CM+ $ID; -$CM+ $IN; -$CM+ $IS; -$CM+ $JL; -$CM+ $JV; -$CM+ $JT; -$CM+ $NS; -$CM+ $NSX; -$CM+ $NU; -$CM+ $OP; -$CM+ $PO; -$CM+ $PR; -$CM+ $QU; -$CM+ $RI; -$CM+ $SY; -$CM+ $WJ; -$CM+; +^$CM+ $ALPlus; +^$CM+ $BA; +^$CM+ $BAX; +^$CM+ $BB; +^$CM+ $B2; +^$CM+ $CL; +^$CM+ $CP; +^$CM+ $EB; +^$CM+ $EM; +^$CM+ $EX; +^$CM+ $GL; +^$CM+ $HL; +^$CM+ $HY; +^$CM+ $H2; +^$CM+ $H3; +^$CM+ $ID; +^$CM+ $IN; +^$CM+ $IS; +^$CM+ $JL; +^$CM+ $JV; +^$CM+ $JT; +^$CM+ $NS; +^$CM+ $NSX; +^$CM+ $NU; +^$CM+ $OP; +^$CM+ $PO; +^$CM+ $PR; +^$CM+ $QU; +^$CM+ $RI; +^$CM+ $SY; +^$CM+ $WJ; +^$CM+; # @@ -467,14 +489,14 @@ $AL_FOLLOW $CM+ / ( [$BK $CR $LF $NL $ZW {eof}] | $SP+ $CM+ $SP | $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break. - # LB14 says OP SP* x . + # LB14 says OP SP* x . # becomes OP SP* x AL # becomes OP SP* x CM+ AL_FOLLOW # # Further note: the $AL in [$AL {eof}] is only to work around # a rule compiler bug which complains about # empty sets otherwise. 
- + # # Sequences of the form (shown forwards) # [CANT_CM] [CM] [PR] @@ -486,7 +508,7 @@ $AL_FOLLOW $CM+ / ( -# LB 4, 5, 5 +# LB 4, 5, 6 $LB4Breaks [$LB4NonBreaks-$CM]; $LB4Breaks $CM+ $CAN_CM; @@ -503,30 +525,37 @@ $LF $CR; # Requires an engine enhancement. # / $SP* $ZW +# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3 +# The ZWJ will look like a CM to whatever precedes it. +# +($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?; + + # LB 9,10 Combining marks. # X $CM needs to behave like X, where X is not $SP or controls. # $CM not covered by the above needs to behave like $AL # Stick together any combining sequences that don't match other rules. -$CM+ $CAN_CM; +^$CM+ $CAN_CM; # LB 11 -$CM* $WJ $CM* $CAN_CM; -$CM* $WJ [$LB8NonBreaks-$CM]; +# +$WJ $CM* $CAN_CM; +$WJ [$LB8NonBreaks-$CM]; $CANT_CM $CM* $WJ; -$CM* $CAN_CM $CM* $WJ; +$CAN_CM $CM* $WJ; # LB 12a # [^SP BA HY] x GL # -$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]]; +$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]]; # LB 12 # GL x # $CANT_CM $CM* $GL; -$CM* $CAN_CM $CM* $GL; +$CAN_CM $CM* $GL; # LB 13 @@ -547,29 +576,27 @@ $SY [$LB8NonBreaks-$CM]; # OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | IY) # This really wants to chain at the $CM+ (which is acting as an $AL) # except for $CM chaining being disabled. -[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP; +[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP; # LB 14 OP SP* x # -$CM* $CAN_CM $SP* $CM* $OP; +$CAN_CM $SP* $CM* $OP; $CANT_CM $SP* $CM* $OP; $AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP - - $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; -$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; -$SY $CM $SP+ $OP; # TODO: Experiment. Remove. 
+ $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; +$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; # LB 15 -$CM* $OP $SP* $CM* $QU; +$OP $SP* $CM* $QU; # LB 16 # Don't include $NSX here -$CM* $NS $SP* $CM* ($CL | $CP); +$NS $SP* $CM* ($CL | $CP); # LB 17 -$CM* $B2 $SP* $CM* $B2; +$B2 $SP* $CM* $B2; # LB 18 break after spaces # Nothing explicit needed here. @@ -578,13 +605,13 @@ $CM* $B2 $SP* $CM* $B2; # # LB 19 # -$CM* $QU $CM* $CAN_CM; # . x QU -$CM* $QU $LB18NonBreaks; +$QU $CM* $CAN_CM; # . x QU +$QU $LB18NonBreaks; -$CM* $CAN_CM $CM* $QU; # QU x . +$CAN_CM $CM* $QU; # QU x . $CANT_CM $CM* $QU; - + # # LB 20 Break before and after CB. # nothing needed here. @@ -592,69 +619,87 @@ $CM* $CAN_CM $CM* $QU; # QU x . # LB 21 # Don't include $BAX or $NSX here -$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) +($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) -$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . -[^$CB] $CM* $BB; # +[$LB20NonBreaks-$CM] $CM* $BB; # BB x . +[^$CB] $CM* $BB; # -# LB21a -[^$CB]? $CM* ($HY | $BA | $BAX) $CM* $HL; +# LB21a Don't break after Hebrew + Hyphen. +([^$CB] $CM*)? ($HY | $BA | $BAX) $CM* $HL; # LB21b (reverse) -$CM* $HL $CM* $SY; +$HL $CM* $SY; # LB 22 -$CM* $IN $CM* ($ALPlus | $HL); -$CM* $IN $CM* $EX; -$CM* $IN $CM* $ID; -$CM* $IN $CM* $IN; -$CM* $IN $CM* $NU; +$IN $CM* ($ALPlus | $HL); +$IN $CM* $EX; +$IN $CM* ($ID | $EB | $EM); +$IN $CM* $IN; +$IN $CM* $NU; # LB 23 -$CM* $PO $CM* $ID; -$CM* $NU $CM* ($ALPlus | $HL); -$CM* ($ALPlus | $HL) $CM* $NU; +$PO $CM* ($ID | $EB | $EM); +$NU $CM* ($ALPlus | $HL); +($ALPlus | $HL) $CM* $NU; # LB 24 -$CM* $ID $CM* $PR; -$CM* ($ALPlus | $HL) $CM* $PR; -$CM* ($ALPlus | $HL) $CM* $PO; +($ID | $EB | $EM) $CM* $PR; +($ALPlus | $HL) $CM* $PR; +($ALPlus | $HL) $CM* $PO; # LB 25 ($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? 
($CM* ($PR | $PO))?; # LB 26 -$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL; -$CM* ($JT | $JV) $CM* ($H2 | $JV); -$CM* $JT $CM* ($H3 | $JT); +($H3 | $H2 | $JV | $JL) $CM* $JL; +($JT | $JV) $CM* ($H2 | $JV); +$JT $CM* ($H3 | $JT); # LB 27 -$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL); -$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL); -$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; +$IN $CM* ($H3 | $H2 | $JT | $JV | $JL); +$PO $CM* ($H3 | $H2 | $JT | $JV | $JL); + ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; # LB 28 -$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL); +($ALPlus | $HL) $CM* ($ALPlus | $HL); # LB 29 -$CM* ($ALPlus | $HL) $CM* $IS; +($ALPlus | $HL) $CM* $IS; # LB 30 -$CM* $OP $CM* ($ALPlus | $HL | $NU); -$CM* ($ALPlus | $HL | $NU) $CM* $CP; +$OP $CM* ($ALPlus | $HL | $NU); +($ALPlus | $HL | $NU) $CM* $CP; # LB 30a -$CM* $RI $CM* $RI; +# Pairs of Regional Indicators. +# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs, +# the second with an even number. Stripping away the cruft they look like +# [^RI] RI / (RI RI)+ ^RI; +# [^RI] RI RI / (RI RI)+ ^RI; +# +[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; +[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; + +# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs. +$RI $CM* $RI; + +# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI". 
+$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL)); + + +# LB 30b Do not break between an Emoji Base and an Emoji Modifier +$EM $CM* $EB; + ## ------------------------------------------------- !!safe_reverse; # LB 9 -$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; -$CM+ $SP / .; +^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; +^$CM+ $SP / .; # LB 14 $SP+ $CM* $OP; @@ -675,6 +720,9 @@ $CM* ($HY | $BA | $BAX) $CM* $HL; ($CM* ($IS | $SY))+ $CM* $NU; ($CL | $CP) $CM* ($NU | $IS | $SY); +# LB 30 +($CM* $RI)+; + # For dictionary-based break $dictionary $dictionary; @@ -691,6 +739,6 @@ $dictionary $dictionary; # turn off rule chaining. We don't want to move more # than necessary. # -[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $dictionary]; +^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $RI $ZWJ $dictionary]; $dictionary $dictionary; diff --git a/icu4c/source/data/brkitr/rules/line_normal_fi.txt b/icu4c/source/data/brkitr/rules/line_normal_fi.txt index 626fdbe9776..33532e0a380 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_fi.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_fi.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2002-2015 International Business Machines Corporation and +# Copyright (c) 2002-2016 International Business Machines Corporation and # others. All Rights Reserved. # # file: line_normal_fi.txt @@ -269,7 +269,7 @@ $GLcm $CANT_CM; # [^SP BA HY] x GL # [[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm; -$CM+ GLcm; +$CM+ $GLcm; diff --git a/icu4c/source/data/brkitr/rules/word.txt b/icu4c/source/data/brkitr/rules/word.txt index e7ea67cfeef..3e40eb6636c 100644 --- a/icu4c/source/data/brkitr/rules/word.txt +++ b/icu4c/source/data/brkitr/rules/word.txt @@ -1,12 +1,13 @@ # -# Copyright (C) 2002-2015, International Business Machines Corporation +# Copyright (C) 2002-2016, International Business Machines Corporation # and others. All Rights Reserved. 
# # file: word.txt # # ICU Word Break Rules # See Unicode Standard Annex #29. -# These rules are based on UAX #29 Revision 22 for Unicode Version 6.3 +# These rules are based on UAX #29 Revision 27 for Unicode Version 8.0 +# with additions from L2/16-011R3 for Emoji sequences. # # Note: Updates to word.txt will usually need to be merged into # word_POSIX.txt also. @@ -24,12 +25,17 @@ # Character Class Definitions. # +$E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; +$E_Modifier = [\U0001F3FB-\U0001F3FF]; +$ZWJ = [\u200D]; +$GAZ = [\U0001F466-\U0001F469\U0001F48B\U0001F5E8\u2764]; + $CR = [\p{Word_Break = CR}]; $LF = [\p{Word_Break = LF}]; -$Newline = [\p{Word_Break = Newline}]; -$Extend = [\p{Word_Break = Extend}]; +$Newline = [\p{Word_Break = Newline} ]; +$Extend = [[\p{Word_Break = Extend}][:Block=Tags:]]; $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; -$Format = [\p{Word_Break = Format}]; +$Format = [[\p{Word_Break = Format}] - [:Block=Tags:]]; $Katakana = [\p{Word_Break = Katakana}]; $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; $ALetter = [\p{Word_Break = ALetter}]; @@ -66,21 +72,21 @@ $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; # except when they appear at the beginning of a region of text. 
# # TODO: check if handling of katakana in dictionary makes rules incorrect/void -$KatakanaEx = $Katakana ($Extend | $Format)*; -$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*; -$ALetterEx = $ALetterPlus ($Extend | $Format)*; -$Single_QuoteEx = $Single_Quote ($Extend | $Format)*; -$Double_QuoteEx = $Double_Quote ($Extend | $Format)*; -$MidNumLetEx = $MidNumLet ($Extend | $Format)*; -$MidLetterEx = $MidLetter ($Extend | $Format)*; -$MidNumEx = $MidNum ($Extend | $Format)*; -$NumericEx = $Numeric ($Extend | $Format)*; -$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; -$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*; +$KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*; +$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*; +$ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*; +$Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*; +$Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*; +$MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*; +$MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*; +$MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*; +$NumericEx = $Numeric ($Extend | $Format | $ZWJ)*; +$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*; +$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*; $Ideographic = [\p{Ideographic}]; -$HiraganaEx = $Hiragana ($Extend | $Format)*; -$IdeographicEx = $Ideographic ($Extend | $Format)*; +$HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*; +$IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*; ## ------------------------------------------------- @@ -91,12 +97,17 @@ $IdeographicEx = $Ideographic ($Extend | $Format)*; # $CR $LF; +# Rule 3c ZWJ x GAZ. Precedes WB4, so no intervening Extend chars allowed. +# +$ZWJ $GAZ; + + # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning # of a region of Text. 
The rule here comes into play when the start of text # begins with a group of Format chars, or with a "word" consisting of a single # char that is not in any of the listed word break categories followed by # format char(s), or is not a CJK dictionary character. -[^$CR $LF $Newline]? ($Extend | $Format)+; +[^$CR $LF $Newline]? ($Extend | $Format | $ZWJ)+; $NumericEx {100}; $ALetterEx {200}; @@ -106,6 +117,10 @@ $KatakanaEx {400}; # note: these status values override those from rule 5 $HiraganaEx {400}; # by virtue of being numerically larger. $IdeographicEx {400}; # +$E_Base ($Extend | $Format | $ZWJ)*; +$E_Modifier ($Extend | $Format | $ZWJ)*; +$GAZ ($Extend | $Format | $ZWJ)*; + # # rule 5 # Do not break between most letters. @@ -157,36 +172,48 @@ $ExtendNumLetEx $NumericEx {100}; # (13b) $ExtendNumLetEx $KatakanaEx {400}; # (13b) # rule 13c - -$Regional_IndicatorEx $Regional_IndicatorEx; +# Pairs of Regional Indicators stay together. +# With rule chaining disabled by ^, this rule will match exactly two of them. +# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. 
+# +^$Regional_IndicatorEx $Regional_IndicatorEx; # special handling for CJK characters: chain for later dictionary segmentation $HangulSyllable $HangulSyllable {200}; $KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found +# rule 13d +# E_Base x E_Modifier +# +($E_Base | $GAZ) ($Format | $Extend | $ZWJ)* $E_Modifier; + ## ------------------------------------------------- !!reverse; -$BackHebrew_LetterEx = ($Format | $Extend)* $Hebrew_Letter; -$BackALetterEx = ($Format | $Extend)* $ALetterPlus; -$BackSingle_QuoteEx = ($Format | $Extend)* $Single_Quote; -$BackDouble_QuoteEx = ($Format | $Extend)* $Double_Quote; -$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet; -$BackNumericEx = ($Format | $Extend)* $Numeric; -$BackMidNumEx = ($Format | $Extend)* $MidNum; -$BackMidLetterEx = ($Format | $Extend)* $MidLetter; -$BackKatakanaEx = ($Format | $Extend)* $Katakana; -$BackHiraganaEx = ($Format | $Extend)* $Hiragana; -$BackExtendNumLetEx = ($Format | $Extend)* $ExtendNumLet; -$BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator; +$BackHebrew_LetterEx = ($Format | $Extend | $ZWJ)* $Hebrew_Letter; +$BackALetterEx = ($Format | $Extend | $ZWJ)* $ALetterPlus; +$BackSingle_QuoteEx = ($Format | $Extend | $ZWJ)* $Single_Quote; +$BackDouble_QuoteEx = ($Format | $Extend | $ZWJ)* $Double_Quote; +$BackMidNumLetEx = ($Format | $Extend | $ZWJ)* $MidNumLet; +$BackNumericEx = ($Format | $Extend | $ZWJ)* $Numeric; +$BackMidNumEx = ($Format | $Extend | $ZWJ)* $MidNum; +$BackMidLetterEx = ($Format | $Extend | $ZWJ)* $MidLetter; +$BackKatakanaEx = ($Format | $Extend | $ZWJ)* $Katakana; +$BackHiraganaEx = ($Format | $Extend | $ZWJ)* $Hiragana; +$BackExtendNumLetEx = ($Format | $Extend | $ZWJ)* $ExtendNumLet; +$BackRegional_IndicatorEx = ($Format | $Extend | $ZWJ)* $Regional_Indicator; # rule 3 $LF $CR; +# Rule 3c ZWJ x GAZ. Preceeds WB4, so no intervening Extend chars allowed. 
+# +$GAZ $ZWJ; + # rule 4 -($Format | $Extend)* [^$CR $LF $Newline]?; +($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?; # rule 5 @@ -229,18 +256,32 @@ $BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $B # rule 13c -$BackRegional_IndicatorEx $BackRegional_IndicatorEx; +^$BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* + ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; +^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* + ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; + +$GAZ $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* + ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; +$GAZ $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* + ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; # special handling for CJK characters: chain for later dictionary segmentation $HangulSyllable $HangulSyllable; $KanaKanji $KanaKanji; #different rule status if both kanji and kana found +# rule 13d + +$E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $GAZ); + + + ## ------------------------------------------------- !!safe_reverse; # rule 3 -($Extend | $Format)+ .?; +($Extend | $Format | $ZWJ)+ .?; # rule 6 ($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx); @@ -252,6 +293,9 @@ $Double_Quote $BackHebrew_LetterEx; # rule 11 ($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx; +# rule 13c +$BackRegional_IndicatorEx*; + # For dictionary-based break $dictionary $dictionary; @@ -260,7 +304,7 @@ $dictionary $dictionary; !!safe_forward; # rule 4 -($Extend | $Format)+ .?; +($Extend | $Format | $ZWJ)+ .?; # rule 6 ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx); @@ -271,5 +315,8 @@ 
$Double_QuoteEx $Hebrew_LetterEx; # rule 11 ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx; +# rule 13c +$Regional_IndicatorEx*; + # For dictionary-based break $dictionary $dictionary; diff --git a/icu4c/source/data/brkitr/rules/word_POSIX.txt b/icu4c/source/data/brkitr/rules/word_POSIX.txt index df305db8104..33ffe00eef9 100644 --- a/icu4c/source/data/brkitr/rules/word_POSIX.txt +++ b/icu4c/source/data/brkitr/rules/word_POSIX.txt @@ -1,12 +1,13 @@ # -# Copyright (C) 2002-2015, International Business Machines Corporation +# Copyright (C) 2002-2016, International Business Machines Corporation # and others. All Rights Reserved. # # file: word_POSIX.txt # # ICU Word Break Rules, POSIX locale. # See Unicode Standard Annex #29. -# These rules are based on UAX #29 Revision 22 for Unicode Version 6.3 +# These rules are based on UAX #29 Revision 27 for Unicode Version 8.0 +# with additions from L2/16-011R3 for Emoji sequences. # # Note: Updates to word.txt will usually need to be merged into # word_POSIX.txt also. @@ -24,12 +25,17 @@ # Character Class Definitions. 
# +$E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; +$E_Modifier = [\U0001F3FB-\U0001F3FF]; +$ZWJ = [\u200D]; +$GAZ = [\U0001F466-\U0001F469\U0001F48B\U0001F5E8\u2764]; + $CR = [\p{Word_Break = CR}]; $LF = [\p{Word_Break = LF}]; -$Newline = [\p{Word_Break = Newline}]; -$Extend = [\p{Word_Break = Extend}]; +$Newline = [\p{Word_Break = Newline} ]; +$Extend = [[\p{Word_Break = Extend}][:Block=Tags:]]; $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; -$Format = [\p{Word_Break = Format}]; +$Format = [[\p{Word_Break = Format}] - [:Block=Tags:]]; $Katakana = [\p{Word_Break = Katakana}]; $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; $ALetter = [\p{Word_Break = ALetter}]; @@ -50,7 +56,7 @@ $Hiragana = [:Hiragana:]; # 5.0 or later as the definition of Complex_Context was corrected to include all # characters requiring dictionary break. -$Control = [\p{Grapheme_Cluster_Break = Control}]; +$Control = [\p{Grapheme_Cluster_Break = Control}]; $HangulSyllable = [\uac00-\ud7a3]; $ComplexContext = [:LineBreak = Complex_Context:]; $KanaKanji = [$Han $Hiragana $Katakana]; @@ -62,25 +68,25 @@ $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; # -# Rules 4 Ignore Format and Extend characters, +# Rules 4 Ignore Format and Extend characters, # except when they appear at the beginning of a region of text. 
# # TODO: check if handling of katakana in dictionary makes rules incorrect/void -$KatakanaEx = $Katakana ($Extend | $Format)*; -$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*; -$ALetterEx = $ALetterPlus ($Extend | $Format)*; -$Single_QuoteEx = $Single_Quote ($Extend | $Format)*; -$Double_QuoteEx = $Double_Quote ($Extend | $Format)*; -$MidNumLetEx = $MidNumLet ($Extend | $Format)*; -$MidLetterEx = $MidLetter ($Extend | $Format)*; -$MidNumEx = $MidNum ($Extend | $Format)*; -$NumericEx = $Numeric ($Extend | $Format)*; -$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; -$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*; +$KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*; +$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*; +$ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*; +$Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*; +$Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*; +$MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*; +$MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*; +$MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*; +$NumericEx = $Numeric ($Extend | $Format | $ZWJ)*; +$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*; +$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*; $Ideographic = [\p{Ideographic}]; -$HiraganaEx = $Hiragana ($Extend | $Format)*; -$IdeographicEx = $Ideographic ($Extend | $Format)*; +$HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*; +$IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*; ## ------------------------------------------------- @@ -91,12 +97,17 @@ $IdeographicEx = $Ideographic ($Extend | $Format)*; # $CR $LF; +# Rule 3c ZWJ x GAZ. Precedes WB4, so no intervening Extend chars allowed. +# +$ZWJ $GAZ; + + # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning # of a region of Text. 
The rule here comes into play when the start of text # begins with a group of Format chars, or with a "word" consisting of a single # char that is not in any of the listed word break categories followed by # format char(s), or is not a CJK dictionary character. -[^$CR $LF $Newline]? ($Extend | $Format)+; +[^$CR $LF $Newline]? ($Extend | $Format | $ZWJ)+; $NumericEx {100}; $ALetterEx {200}; @@ -106,6 +117,10 @@ $KatakanaEx {400}; # note: these status values override those from rule 5 $HiraganaEx {400}; # by virtue of being numerically larger. $IdeographicEx {400}; # +$E_Base ($Extend | $Format | $ZWJ)*; +$E_Modifier ($Extend | $Format | $ZWJ)*; +$GAZ ($Extend | $Format | $ZWJ)*; + # # rule 5 # Do not break between most letters. @@ -133,7 +148,7 @@ $NumericEx $NumericEx {100}; $NumericEx ($ALetterEx | $Hebrew_LetterEx) {200}; -# rule 11 and 12 +# rule 11 and 12 $NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100}; @@ -157,36 +172,48 @@ $ExtendNumLetEx $NumericEx {100}; # (13b) $ExtendNumLetEx $KatakanaEx {400}; # (13b) # rule 13c - -$Regional_IndicatorEx $Regional_IndicatorEx; +# Pairs of Regional Indicators stay together. +# With rule chaining disabled by ^, this rule will match exactly two of them. +# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. 
+# +^$Regional_IndicatorEx $Regional_IndicatorEx; # special handling for CJK characters: chain for later dictionary segmentation $HangulSyllable $HangulSyllable {200}; -$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found + +# rule 13d +# E_Base x E_Modifier +# +($E_Base | $GAZ) ($Format | $Extend | $ZWJ)* $E_Modifier; ## ------------------------------------------------- !!reverse; -$BackHebrew_LetterEx = ($Format | $Extend)* $Hebrew_Letter; -$BackALetterEx = ($Format | $Extend)* $ALetterPlus; -$BackSingle_QuoteEx = ($Format | $Extend)* $Single_Quote; -$BackDouble_QuoteEx = ($Format | $Extend)* $Double_Quote; -$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet; -$BackNumericEx = ($Format | $Extend)* $Numeric; -$BackMidNumEx = ($Format | $Extend)* $MidNum; -$BackMidLetterEx = ($Format | $Extend)* $MidLetter; -$BackKatakanaEx = ($Format | $Extend)* $Katakana; -$BackHiraganaEx = ($Format | $Extend)* $Hiragana; -$BackExtendNumLetEx = ($Format | $Extend)* $ExtendNumLet; -$BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator; +$BackHebrew_LetterEx = ($Format | $Extend | $ZWJ)* $Hebrew_Letter; +$BackALetterEx = ($Format | $Extend | $ZWJ)* $ALetterPlus; +$BackSingle_QuoteEx = ($Format | $Extend | $ZWJ)* $Single_Quote; +$BackDouble_QuoteEx = ($Format | $Extend | $ZWJ)* $Double_Quote; +$BackMidNumLetEx = ($Format | $Extend | $ZWJ)* $MidNumLet; +$BackNumericEx = ($Format | $Extend | $ZWJ)* $Numeric; +$BackMidNumEx = ($Format | $Extend | $ZWJ)* $MidNum; +$BackMidLetterEx = ($Format | $Extend | $ZWJ)* $MidLetter; +$BackKatakanaEx = ($Format | $Extend | $ZWJ)* $Katakana; +$BackHiraganaEx = ($Format | $Extend | $ZWJ)* $Hiragana; +$BackExtendNumLetEx = ($Format | $Extend | $ZWJ)* $ExtendNumLet; +$BackRegional_IndicatorEx = ($Format | $Extend | $ZWJ)* $Regional_Indicator; # rule 3 $LF $CR; +# Rule 3c ZWJ x GAZ. 
Precedes WB4, so no intervening Extend chars allowed. +# +$GAZ $ZWJ; + # rule 4 -($Format | $Extend)* [^$CR $LF $Newline]?; +($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?; # rule 5 @@ -225,22 +252,36 @@ $BackKatakanaEx $BackKatakanaEx; # rules 13 a/b # $BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx); -($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx; +($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx; # rule 13c -$BackRegional_IndicatorEx $BackRegional_IndicatorEx; +^$BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* + ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; +^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* + ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; + +$GAZ $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* + ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; +$GAZ $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)* + ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}]; # special handling for CJK characters: chain for later dictionary segmentation $HangulSyllable $HangulSyllable; $KanaKanji $KanaKanji; #different rule status if both kanji and kana found +# rule 13d + +$E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $GAZ); + + + ## ------------------------------------------------- !!safe_reverse; # rule 3 -($Extend | $Format)+ .?; +($Extend | $Format | $ZWJ)+ .?; # rule 6 ($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx); @@ -252,6 +293,9 @@ $Double_Quote $BackHebrew_LetterEx; # rule 11 ($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx; +# 
rule 13c +$BackRegional_IndicatorEx*; + # For dictionary-based break $dictionary $dictionary; @@ -260,7 +304,7 @@ $dictionary $dictionary; !!safe_forward; # rule 4 -($Extend | $Format)+ .?; +($Extend | $Format | $ZWJ)+ .?; # rule 6 ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx); @@ -271,5 +315,8 @@ $Double_QuoteEx $Hebrew_LetterEx; # rule 11 ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx; +# rule 13c +$Regional_IndicatorEx*; + # For dictionary-based break $dictionary $dictionary; diff --git a/icu4c/source/test/cintltst/cbiapts.c b/icu4c/source/test/cintltst/cbiapts.c index 62ad4626480..b07c97ae61e 100644 --- a/icu4c/source/test/cintltst/cbiapts.c +++ b/icu4c/source/test/cintltst/cbiapts.c @@ -543,7 +543,7 @@ static void TestBreakIteratorRules() { * keep together 'abc', but only when followed by 'def', OTHERWISE * just return one char at a time. */ - char rules[] = "abc{666}/def;\n [\\p{L} - [a]]* {2}; . {1};"; + char rules[] = "abc/def{666};\n [\\p{L} - [a]]* {2}; . 
{1};"; /* 0123456789012345678 */ char data[] = "abcdex abcdefgh-def"; /* the test data string */ char breaks[] = "** ** * ** *"; /* * the expected break positions */ diff --git a/icu4c/source/test/intltest/Makefile.in b/icu4c/source/test/intltest/Makefile.in index 5c5fa98ae55..e929f7f30df 100644 --- a/icu4c/source/test/intltest/Makefile.in +++ b/icu4c/source/test/intltest/Makefile.in @@ -49,7 +49,7 @@ tfsmalls.o tmsgfmt.o trcoll.o tscoll.o tsdate.o tsdcfmsy.o tsdtfmsy.o \ tsmthred.o tsnmfmt.o tsputil.o tstnrapi.o tstnorm.o tzbdtest.o \ tzregts.o tztest.o ucdtest.o usettest.o ustrtest.o strcase.o transtst.o strtest.o thcoll.o \ bytestrietest.o ucharstrietest.o \ -itrbbi.o rbbiapts.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \ +itrbbi.o rbbiapts.o rbbitst.o rbbimonkeytest.o ittrans.o transapi.o cpdtrtst.o \ testutil.o transrt.o trnserr.o normconf.o sfwdchit.o \ jamotest.o srchtest.o reptest.o regextst.o \ itrbnf.o itrbnfrt.o itrbnfp.o ucaconf.o icusvtst.o \ diff --git a/icu4c/source/test/intltest/intltest.cpp b/icu4c/source/test/intltest/intltest.cpp index f641f32be3d..4a0a3e31972 100644 --- a/icu4c/source/test/intltest/intltest.cpp +++ b/icu4c/source/test/intltest/intltest.cpp @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2015, International Business Machines Corporation and + * Copyright (c) 1997-2016, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -1790,6 +1790,39 @@ float IntlTest::random() { return random(&RAND_SEED); } + +/* + * Integer random number class implementation. + * Similar to C++ std::minstd_rand, with the same algorithm & constants. 
+ */ +IntlTest::icu_rand::icu_rand(uint32_t seed) { + seed = seed % 2147483647UL; + if (seed == 0) { + seed = 1; + } + fLast = seed; +} + +IntlTest::icu_rand::~icu_rand() {}; + +void IntlTest::icu_rand::seed(uint32_t seed) { + if (seed == 0) { + seed = 1; + } + fLast = seed; +} + +uint32_t IntlTest::icu_rand::operator() () { + fLast = ((uint64_t)fLast * 48271UL) % 2147483647UL; + return fLast; +} + +uint32_t IntlTest::icu_rand::getSeed() { + return (uint32_t) fLast; +} + + + static inline UChar toHex(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); } diff --git a/icu4c/source/test/intltest/intltest.h b/icu4c/source/test/intltest/intltest.h index 58877cf94d1..b8793ba889d 100644 --- a/icu4c/source/test/intltest/intltest.h +++ b/icu4c/source/test/intltest/intltest.h @@ -1,6 +1,6 @@ /******************************************************************** - * COPYRIGHT: - * Copyright (c) 1997-2015, International Business Machines Corporation and + * COPYRIGHT: + * Copyright (c) 1997-2016, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -18,7 +18,7 @@ #if U_NO_DEFAULT_INCLUDE_UTF_HEADERS /* deprecated - make tests pass with U_NO_DEFAULT_INCLUDE_UTF_HEADERS */ -#include "unicode/utf_old.h" +#include "unicode/utf_old.h" #endif U_NAMESPACE_USE @@ -166,7 +166,7 @@ public: /** * Replaces isICUVersionAtLeast and isICUVersionBefore * log that an issue is known. - * Usually used this way: + * Usually used this way: * if( ... && logKnownIssue("12345", "some bug")) continue; * @param ticket ticket string, "12345" or "cldrbug:1234" * @param message optional message string @@ -230,11 +230,11 @@ public: void errcheckln(UErrorCode status, const char *fmt, ...); // Print ALL named errors encountered so far - void printErrors(); + void printErrors(); // print known issues. return TRUE if there were any. 
UBool printKnownIssues(); - + virtual void usage( void ) ; /** @@ -253,6 +253,30 @@ public: */ static float random(); + + /** + * Integer random numbers, similar to C++ std::minstd_rand, with the same algorithm + * and constants. Allow additional access to internal state, for use by monkey tests, + * which need to recreate previous random sequences beginning near a failure point. + */ + class icu_rand { + public: + icu_rand(uint32_t seed = 1); + ~icu_rand(); + void seed(uint32_t seed); + uint32_t operator()(); + /** + * Get a seed corresponding to the current state of the generator. + * Seeding any generator with this value will cause it to produce the + * same sequence as this one will from this point forward. + */ + uint32_t getSeed(); + private: + uint32_t fLast; + }; + + + enum { kMaxProps = 16 }; virtual void setProperty(const char* propline); @@ -320,7 +344,7 @@ private: int32_t dataErrorCount; IntlTest* caller; char* testPath; // specifies subtests - + char basePath[1024]; char currName[1024]; // current test name diff --git a/icu4c/source/test/intltest/intltest.vcxproj b/icu4c/source/test/intltest/intltest.vcxproj index 0368145e935..1ec077bbc49 100644 --- a/icu4c/source/test/intltest/intltest.vcxproj +++ b/icu4c/source/test/intltest/intltest.vcxproj @@ -238,6 +238,7 @@ false + @@ -434,6 +435,7 @@ + diff --git a/icu4c/source/test/intltest/intltest.vcxproj.filters b/icu4c/source/test/intltest/intltest.vcxproj.filters index 47adcd18dad..65ef6b9bd68 100644 --- a/icu4c/source/test/intltest/intltest.vcxproj.filters +++ b/icu4c/source/test/intltest/intltest.vcxproj.filters @@ -70,6 +70,9 @@ break iteration + + break iteration + spoof detection @@ -504,6 +507,9 @@ break iteration + + break iteration + spoof detection diff --git a/icu4c/source/test/intltest/itrbbi.cpp b/icu4c/source/test/intltest/itrbbi.cpp index b99a405beae..acd1a2f6177 100644 --- a/icu4c/source/test/intltest/itrbbi.cpp +++ b/icu4c/source/test/intltest/itrbbi.cpp @@ -1,6 +1,6 @@ /* 
********************************************************************** -* Copyright (C) 1998-2012, International Business Machines Corporation +* Copyright (C) 1998-2016, International Business Machines Corporation * and others. All Rights Reserved. ********************************************************************** */ @@ -16,30 +16,23 @@ #if !UCONFIG_NO_BREAK_ITERATION +#include "intltest.h" #include "itrbbi.h" #include "rbbiapts.h" #include "rbbitst.h" - -#define TESTCLASS(n,classname) \ - case n: \ - name = #classname; \ - if (exec) { \ - logln(#classname "---"); \ - logln(""); \ - classname t; \ - callTest(t, par); \ - } \ - break +#include "rbbimonkeytest.h" void IntlTestRBBI::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par ) { - if (exec) logln("TestSuite RuleBasedBreakIterator: "); - switch (index) { - TESTCLASS(0, RBBIAPITest); - TESTCLASS(1, RBBITest); - default: name=""; break; + if (exec) { + logln("TestSuite RuleBasedBreakIterator: "); } + TESTCASE_AUTO_BEGIN; + TESTCASE_AUTO_CLASS(RBBIAPITest); + TESTCASE_AUTO_CLASS(RBBITest); + TESTCASE_AUTO_CLASS(RBBIMonkeyTest); + TESTCASE_AUTO_END; } #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ diff --git a/icu4c/source/test/intltest/rbbimonkeytest.cpp b/icu4c/source/test/intltest/rbbimonkeytest.cpp new file mode 100644 index 00000000000..5be9f229fcc --- /dev/null +++ b/icu4c/source/test/intltest/rbbimonkeytest.cpp @@ -0,0 +1,976 @@ +/******************************************************************** + * Copyright (c) 2016, International Business Machines Corporation and + * others. All Rights Reserved. 
+ ********************************************************************/ + + +#include "unicode/utypes.h" + +#include "rbbimonkeytest.h" +#include "unicode/utypes.h" +#include "unicode/brkiter.h" +#include "unicode/utf16.h" +#include "unicode/uniset.h" +#include "unicode/unistr.h" + +#include "charstr.h" +#include "cmemory.h" +#include "cstr.h" +#include "uelement.h" +#include "uhash.h" + +#include "iostream" +#include "string" + +using namespace icu; + + +void RBBIMonkeyTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* params) { + fParams = params; // Work around TESTCASE_AUTO not being able to pass params to test function. + + TESTCASE_AUTO_BEGIN; + TESTCASE_AUTO(testMonkey); + TESTCASE_AUTO_END; +} + +//--------------------------------------------------------------------------------------- +// +// class BreakRule implementation. +// +//--------------------------------------------------------------------------------------- + +BreakRule::BreakRule() // : all field default initialized. +{ +} + +BreakRule::~BreakRule() {}; + + +//--------------------------------------------------------------------------------------- +// +// class BreakRules implementation. +// +//--------------------------------------------------------------------------------------- +BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status) : + fMonkeyImpl(monkeyImpl), fBreakRules(status), fType(UBRK_COUNT) { + fCharClasses.adoptInstead(uhash_open(uhash_hashUnicodeString, + uhash_compareUnicodeString, + NULL, // value comparator. 
+ &status)); + if (U_FAILURE(status)) { + return; + } + uhash_setKeyDeleter(fCharClasses.getAlias(), uprv_deleteUObject); + uhash_setValueDeleter(fCharClasses.getAlias(), uprv_deleteUObject); + fBreakRules.setDeleter(uprv_deleteUObject); + + fCharClassList.adoptInstead(new UVector(status)); + + fSetRefsMatcher.adoptInstead(new RegexMatcher(UnicodeString( + "(?!(?:\\{|=|\\[:)[ \\t]{0,4})" // Negative lookbehind for '{' or '=' or '[:' + // (the identifier is a unicode property name or value) + "(?[A-Za-z_][A-Za-z0-9_]*)"), // The char class name + 0, status)); + + // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules. + fCommentsMatcher.adoptInstead(new RegexMatcher(UnicodeString( + "(^|(?<=;))" // Start either at start of line, or just after a ';' (look-behind for ';') + "[ \\t]*+" // Match white space. + "(#.*)?+" // Optional # plus whatever follows + "\\R$" // new-line at end of line. + ), 0, status)); + + // Match (initial parse) of a character class definition line. + fClassDefMatcher.adoptInstead(new RegexMatcher(UnicodeString( + "[ \\t]*" // leading white space + "(?[A-Za-z_][A-Za-z0-9_]*)" // The char class name + "[ \\t]*=[ \\t]*" // = + "(?.*?)" // The char class UnicodeSet expression + "[ \\t]*;$"), // ; + 0, status)); + + // Match (initial parse) of a break rule line. + fRuleDefMatcher.adoptInstead(new RegexMatcher(UnicodeString( + "[ \\t]*" // leading white space + "(?[A-Za-z_][A-Za-z0-9_.]*)" // The rule name + "[ \\t]*:[ \\t]*" // : + "(?.*?)" // The rule definition + "[ \\t]*;$"), // ; + 0, status)); + +} + + +BreakRules::~BreakRules() {}; + + +CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) { + + // Create the expanded definition for this char class, + // replacing any set references with the corresponding definition. 
+ + UnicodeString expandedDef; + UnicodeString emptyString; + fSetRefsMatcher->reset(definition); + while (fSetRefsMatcher->find() && U_SUCCESS(status)) { + const UnicodeString name = + fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status); + CharClass *nameClass = static_cast(uhash_get(fCharClasses.getAlias(), &name)); + const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name; + + fSetRefsMatcher->appendReplacement(expandedDef, emptyString, status); + expandedDef.append(expansionForName); + } + fSetRefsMatcher->appendTail(expandedDef); + + // Verify that the expanded set definition is valid. + + if (fMonkeyImpl->fDumpExpansions) { + printf("epandedDef: %s\n", CStr(expandedDef)()); + } + + UnicodeSet *s = new UnicodeSet(expandedDef, USET_IGNORE_SPACE, NULL, status); + if (U_FAILURE(status)) { + IntlTest::gTest->errln("%s:%d: error %s creating UnicodeSet %s", __FILE__, __LINE__, + u_errorName(status), CStr(name)()); + return NULL; + } + CharClass *cclass = new CharClass(name, definition, expandedDef, s); + CharClass *previousClass = static_cast(uhash_put(fCharClasses.getAlias(), + new UnicodeString(name), // Key, owned by hash table. + cclass, // Value, owned by hash table. + &status)); + + if (previousClass != NULL) { + // Duplicate class def. + // These are legitimate, they are adjustments of an existing class. + // TODO: will need to keep the old around when we handle tailorings. 
+ IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(cclass->fName)()); + delete previousClass; + } + return cclass; +} + + +void BreakRules::addRule(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) { + LocalPointer thisRule(new BreakRule); + thisRule->fName = name; + thisRule->fRule = definition; + + // If the rule name contains embedded digits, pad the first numeric field to a fixed length with leading zeroes, + // This gives a numeric sort order that matches Unicode UAX rule numbering conventions. + UnicodeString emptyString; + + // Expand the char class definitions within the rule. + fSetRefsMatcher->reset(definition); + while (fSetRefsMatcher->find() && U_SUCCESS(status)) { + const UnicodeString name = + fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status); + CharClass *nameClass = static_cast(uhash_get(fCharClasses.getAlias(), &name)); + if (!nameClass) { + IntlTest::gTest->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"", + __FILE__, __LINE__, CStr(name)(), CStr(definition)()); + } + const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name; + + fSetRefsMatcher->appendReplacement(thisRule->fExpandedRule, emptyString, status); + thisRule->fExpandedRule.append(expansionForName); + } + fSetRefsMatcher->appendTail(thisRule->fExpandedRule); + + // Replace the divide sign (\u00f7) with a regular expression named capture. + // When running the rules, a match that includes this group means we found a break position. + + int32_t dividePos = thisRule->fExpandedRule.indexOf((UChar)0x00f7); + if (dividePos >= 0) { + thisRule->fExpandedRule.replace(dividePos, 1, UnicodeString("(?)")); + } + if (thisRule->fExpandedRule.indexOf((UChar)0x00f7) != -1) { + status = U_ILLEGAL_ARGUMENT_ERROR; // TODO: produce a good error message. + } + + // UAX break rule set definitions can be empty, just []. 
+ // Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which + // also matches nothing. + + static const UChar emptySet[] = {(UChar)0x5b, (UChar)0x5d, 0}; + int32_t where = 0; + while ((where = thisRule->fExpandedRule.indexOf(emptySet, 2, 0)) >= 0) { + thisRule->fExpandedRule.replace(where, 2, UnicodeString("[^\\u0000-\\U0010ffff]")); + } + if (fMonkeyImpl->fDumpExpansions) { + printf("fExpandedRule: %s\n", CStr(thisRule->fExpandedRule)()); + } + + // Compile a regular expression for this rule. + thisRule->fRuleMatcher.adoptInstead(new RegexMatcher(thisRule->fExpandedRule, UREGEX_COMMENTS | UREGEX_DOTALL, status)); + if (U_FAILURE(status)) { + IntlTest::gTest->errln("%s:%d Error creating regular expression for %s", + __FILE__, __LINE__, CStr(thisRule->fExpandedRule)()); + return; + } + + // Put this new rule into the vector of all Rules. + fBreakRules.addElement(thisRule.orphan(), status); +} + + +bool BreakRules::setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status) { + if (keyword == UnicodeString("locale")) { + CharString localeName; + localeName.append(CStr(value)(), -1, status); + fLocale = Locale::createFromName(localeName.data()); + return true; + } + if (keyword == UnicodeString("type")) { + if (value == UnicodeString("grapheme")) { + fType = UBRK_CHARACTER; + } else if (value == UnicodeString("word")) { + fType = UBRK_WORD; + } else if (value == UnicodeString("line")) { + fType = UBRK_LINE; + } else if (value == UnicodeString("sentence")) { + fType = UBRK_SENTENCE; + } else { + IntlTest::gTest->errln("%s:%d Unrecognized break type %s", __FILE__, __LINE__, CStr(value)()); + } + return true; + } + // TODO: add tailoring base setting here. 
+ return false; +} + +RuleBasedBreakIterator *BreakRules::createICUBreakIterator(UErrorCode &status) { + if (U_FAILURE(status)) { + return NULL; + } + RuleBasedBreakIterator *bi = NULL; + switch(fType) { + case UBRK_CHARACTER: + bi = dynamic_cast(BreakIterator::createCharacterInstance(fLocale, status)); + break; + case UBRK_WORD: + bi = dynamic_cast(BreakIterator::createWordInstance(fLocale, status)); + break; + case UBRK_LINE: + bi = dynamic_cast(BreakIterator::createLineInstance(fLocale, status)); + break; + case UBRK_SENTENCE: + bi = dynamic_cast(BreakIterator::createSentenceInstance(fLocale, status)); + break; + default: + IntlTest::gTest->errln("%s:%d Bad break iterator type of %d", __FILE__, __LINE__, fType); + status = U_ILLEGAL_ARGUMENT_ERROR; + } + return bi; +} + + +void BreakRules::compileRules(UCHARBUF *rules, UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + + UnicodeString emptyString; + for (int32_t lineNumber=0; ;lineNumber++) { // Loop once per input line. + if (U_FAILURE(status)) { + return; + } + int32_t lineLength = 0; + const UChar *lineBuf = ucbuf_readline(rules, &lineLength, &status); + if (lineBuf == NULL) { + break; + } + UnicodeString line(lineBuf, lineLength); + + // Strip comment lines. + fCommentsMatcher->reset(line); + line = fCommentsMatcher->replaceFirst(emptyString, status); + if (line.isEmpty()) { + continue; + } + + // Recognize character class definition and keyword lines + fClassDefMatcher->reset(line); + if (fClassDefMatcher->matches(status)) { + UnicodeString className = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassName", status), status); + UnicodeString classDef = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassDef", status), status); + if (fMonkeyImpl->fDumpExpansions) { + printf("scanned class: %s = %s\n", CStr(className)(), CStr(classDef)()); + } + if (setKeywordParameter(className, classDef, status)) { + // The scanned item was "type = ..." 
or "locale = ...", etc. + // which are not actual character classes. + continue; + } + addCharClass(className, classDef, status); + continue; + } + + // Recognize rule lines. + fRuleDefMatcher->reset(line); + if (fRuleDefMatcher->matches(status)) { + UnicodeString ruleName = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleName", status), status); + UnicodeString ruleDef = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleDef", status), status); + if (fMonkeyImpl->fDumpExpansions) { + printf("scanned rule: %s : %s\n", CStr(ruleName)(), CStr(ruleDef)()); + } + addRule(ruleName, ruleDef, status); + continue; + } + + IntlTest::gTest->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n", + __FILE__, __LINE__, fMonkeyImpl->fRuleFileName, CStr(line)()); + } + + // Build the vector of char classes, omitting the dictionary class if there is one. + // This will be used when constructing the random text to be tested. + + // Also compute the "other" set, consisting of any characters not included in + // one or more of the user defined sets. 
+ + UnicodeSet otherSet((UChar32)0, 0x10ffff); + int32_t pos = UHASH_FIRST; + const UHashElement *el = NULL; + while ((el = uhash_nextElement(fCharClasses.getAlias(), &pos)) != NULL) { + const UnicodeString *ccName = static_cast(el->key.pointer); + CharClass *cclass = static_cast(el->value.pointer); + // printf(" Adding %s\n", CStr(*ccName)()); + if (*ccName != cclass->fName) { + IntlTest::gTest->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n", + __FILE__, __LINE__, CStr(*ccName)(), CStr(cclass->fName)()); + } + const UnicodeSet *set = cclass->fSet.getAlias(); + otherSet.removeAll(*set); + if (*ccName == UnicodeString("dictionary")) { + fDictionarySet = *set; + } else { + fCharClassList->addElement(cclass, status); + } + } + + if (!otherSet.isEmpty()) { + // fprintf(stderr, "have an other set.\n"); + UnicodeString pattern; + CharClass *cclass = addCharClass(UnicodeString("__Others"), otherSet.toPattern(pattern), status); + fCharClassList->addElement(cclass, status); + } +} + + +const CharClass *BreakRules::getClassForChar(UChar32 c, int32_t *iter) const { + int32_t localIter = 0; + int32_t &it = iter? *iter : localIter; + + while (it < fCharClassList->size()) { + const CharClass *cc = static_cast(fCharClassList->elementAt(it)); + ++it; + if (cc->fSet->contains(c)) { + return cc; + } + } + return NULL; +} + +//--------------------------------------------------------------------------------------- +// +// class MonkeyTestData implementation. +// +//--------------------------------------------------------------------------------------- + +void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status) { + const int32_t dataLength = 1000; + + // Fill the test string with random characters. + // First randomly pick a char class, then randomly pick a character from that class. + // Exclude any characters from the dictionary set. 
+ + // std::cout << "Populating Test Data" << std::endl; + fRandomSeed = rand.getSeed(); // Save initial seed for use in error messages, + // allowing recreation of failing data. + fBkRules = rules; + fString.remove(); + for (int32_t n=0; nfCharClassList->size(); + const CharClass *cclass = static_cast(rules->fCharClassList->elementAt(charClassIndex)); + if (cclass->fSet->size() == 0) { + // Some rules or tailorings do end up with empty char classes. + continue; + } + int32_t charIndex = rand() % cclass->fSet->size(); + UChar32 c = cclass->fSet->charAt(charIndex); + if (U16_IS_TRAIL(c) && fString.length() > 0 && U16_IS_LEAD(fString.charAt(fString.length()-1))) { + // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control. + // Don't let random unpaired surrogates combine in the test data because they might + // produce an unwanted dictionary character. + continue; + } + + if (!rules->fDictionarySet.contains(c)) { + fString.append(c); + ++n; + } + } + + // Reset each rule matcher regex with this new string. + // (Although we are always using the same string object, ICU regular expressions + // don't like the underlying string data changing without doing a reset). + + for (int32_t ruleNum=0; ruleNumfBreakRules.size(); ruleNum++) { + BreakRule *rule = static_cast(rules->fBreakRules.elementAt(ruleNum)); + rule->fRuleMatcher->reset(fString); + } + + // Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays). + // Expected and Actual breaks are one longer than the input string; a non-zero value + // will indicate a boundary preceding that position. + + clearActualBreaks(); + fExpectedBreaks = fActualBreaks; + fRuleForPosition = fActualBreaks; + f2ndRuleForPos = fActualBreaks; + + // Apply reference rules to find the expected breaks. + + fExpectedBreaks.setCharAt(0, (UChar)1); // Force an expected break before the start of the text. + // ICU always reports a break there. 
+ // The reference rules do not have a means to do so. + int32_t strIdx = 0; + while (strIdx < fString.length()) { + BreakRule *matchingRule = NULL; + UBool hasBreak = FALSE; + int32_t ruleNum = 0; + int32_t matchStart = 0; + int32_t matchEnd = 0; + int32_t breakGroup = 0; + for (ruleNum=0; ruleNumfBreakRules.size(); ruleNum++) { + BreakRule *rule = static_cast(rules->fBreakRules.elementAt(ruleNum)); + rule->fRuleMatcher->reset(); + if (rule->fRuleMatcher->lookingAt(strIdx, status)) { + // A candidate rule match, check further to see if we take it or continue to check other rules. + // Matches of zero or one codepoint count only if they also specify a break. + matchStart = rule->fRuleMatcher->start(status); + matchEnd = rule->fRuleMatcher->end(status); + breakGroup = rule->fRuleMatcher->pattern().groupNumberFromName("BreakPosition", status); + hasBreak = U_SUCCESS(status); + if (status == U_REGEX_INVALID_CAPTURE_GROUP_NAME) { + status = U_ZERO_ERROR; + } + if (hasBreak || fString.moveIndex32(matchStart, 1) < matchEnd) { + matchingRule = rule; + break; + } + } + } + if (matchingRule == NULL) { + // No reference rule matched. This is an error in the rules that should never happen. + IntlTest::gTest->errln("%s:%d Trouble with monkey test reference rules at position %d. ", + __FILE__, __LINE__, strIdx); + dump(strIdx); + status = U_INVALID_FORMAT_ERROR; + return; + } + if (matchingRule->fRuleMatcher->group(status).length() == 0) { + // Zero length rule match. This is also an error in the rule expressions. + IntlTest::gTest->errln("%s:%d Zero length rule match.", + __FILE__, __LINE__); + status = U_INVALID_FORMAT_ERROR; + return; + } + + // Record which rule matched over the length of the match. 
+ for (int i = matchStart; i < matchEnd; i++) { + if (fRuleForPosition.charAt(i) == 0) { + fRuleForPosition.setCharAt(i, (UChar)ruleNum); + } else { + f2ndRuleForPos.setCharAt(i, (UChar)ruleNum); + } + } + + // Break positions appear in rules as a matching named capture of zero length at the break position, + // the adjusted pattern contains (?) + if (hasBreak) { + int32_t breakPos = matchingRule->fRuleMatcher->start(breakGroup, status); + if (U_FAILURE(status) || breakPos < 0) { + // Rule specified a break, but that break wasn't part of the match, even + // though the rule as a whole matched. + // Can't happen with regular expressions derived from (equivalent to) ICU break rules. + // Shouldn't get here. + IntlTest::gTest->errln("%s:%d Internal Rule Error.", __FILE__, __LINE__); + status = U_INVALID_FORMAT_ERROR; + break; + } + fExpectedBreaks.setCharAt(breakPos, (UChar)1); + // printf("recording break at %d\n", breakPos); + // For the next iteration, pick up applying rules immediately after the break, + // which may differ from end of the match. The matching rule may have included + // context following the boundary that needs to be looked at again. + strIdx = matchingRule->fRuleMatcher->end(breakGroup, status); + } else { + // Original rule didn't specify a break. + // Continue applying rules starting on the last code point of this match. + strIdx = fString.moveIndex32(matchEnd, -1); + if (strIdx == matchStart) { + // Match was only one code point, no progress if we continue. + // Shouldn't get here, case is filtered out at top of loop. + CharString ruleName; + ruleName.appendInvariantChars(matchingRule->fName, status); + IntlTest::gTest->errln("%s:%d Rule %s internal error", + __FILE__, __LINE__, ruleName.data()); + status = U_INVALID_FORMAT_ERROR; + break; + } + } + if (U_FAILURE(status)) { + IntlTest::gTest->errln("%s:%d status = %s. 
Unexpected failure, perhaps problem internal to test.", + __FILE__, __LINE__, u_errorName(status)); + break; + } + } +} + +void MonkeyTestData::clearActualBreaks() { + fActualBreaks.remove(); + // Actual Breaks length is one longer than the data string length, allowing + // for breaks before the first and after the last character in the data. + for (int32_t i=0; i<=fString.length(); i++) { + fActualBreaks.append((UChar)0); + } +} + +void MonkeyTestData::dump(int32_t around) const { + printf("\n" + " char break Rule Character\n" + " pos code class R I name name\n" + "---------------------------------------------------------------------------------------------\n"); + + int32_t start; + int32_t end; + + if (around == -1) { + start = 0; + end = fString.length(); + } else { + // Display context around a failure. + start = fString.moveIndex32(around, -30); + end = fString.moveIndex32(around, +30); + } + + for (int charIdx = start; charIdx < end; charIdx=fString.moveIndex32(charIdx, 1)) { + UErrorCode status = U_ZERO_ERROR; + UChar32 c = fString.char32At(charIdx); + const CharClass *cc = fBkRules->getClassForChar(c); + CharString ccName; + ccName.appendInvariantChars(cc->fName, status); + CharString ruleName, secondRuleName; + const BreakRule *rule = static_cast(fBkRules->fBreakRules.elementAt(fRuleForPosition.charAt(charIdx))); + ruleName.appendInvariantChars(rule->fName, status); + if (f2ndRuleForPos.charAt(charIdx) > 0) { + const BreakRule *secondRule = static_cast(fBkRules->fBreakRules.elementAt(f2ndRuleForPos.charAt(charIdx))); + secondRuleName.appendInvariantChars(secondRule->fName, status); + } + char cName[200]; + u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status); + + printf(" %4.1d %6.4x %-20s %c %c %-10s %-10s %s\n", + charIdx, c, ccName.data(), + fExpectedBreaks.charAt(charIdx) ? '*' : '.', + fActualBreaks.charAt(charIdx) ? 
'*' : '.', + ruleName.data(), secondRuleName.data(), cName + ); + } +} + + +//--------------------------------------------------------------------------------------- +// +// class RBBIMonkeyImpl +// +//--------------------------------------------------------------------------------------- + +RBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode &status) : fDumpExpansions(FALSE), fThread(this) { + (void)status; // suppress unused parameter compiler warning. +} + + +// RBBIMonkeyImpl setup does all of the setup for a single rule set - compiling the +// reference rules and creating the icu breakiterator to test, +// with its type and locale coming from the reference rules. + +void RBBIMonkeyImpl::setup(const char *ruleFile, UErrorCode &status) { + fRuleFileName = ruleFile; + openBreakRules(ruleFile, status); + if (U_FAILURE(status)) { + IntlTest::gTest->errln("%s:%d Error %s opening file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile); + return; + } + fRuleSet.adoptInstead(new BreakRules(this, status)); + fRuleSet->compileRules(fRuleCharBuffer.getAlias(), status); + if (U_FAILURE(status)) { + IntlTest::gTest->errln("%s:%d Error %s processing file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile); + return; + } + fBI.adoptInstead(fRuleSet->createICUBreakIterator(status)); + fTestData.adoptInstead(new MonkeyTestData()); +} + + +RBBIMonkeyImpl::~RBBIMonkeyImpl() { +} + + +void RBBIMonkeyImpl::openBreakRules(const char *fileName, UErrorCode &status) { + CharString path; + path.append(IntlTest::getSourceTestData(status), status); + path.append("break_rules" U_FILE_SEP_STRING, status); + path.appendPathPart(fileName, status); + const char *codePage = "UTF-8"; + fRuleCharBuffer.adoptInstead(ucbuf_open(path.data(), &codePage, TRUE, FALSE, &status)); +} + + +void RBBIMonkeyImpl::startTest() { + fThread.start(); // invokes runTest() in a separate thread. 
+}
+// Join fThread, the thread that startTest() started.
+void RBBIMonkeyImpl::join() {
+    fThread.join();
+}
+
+// Report a failure with the parameters needed to reproduce it; expects a local 'status' where it expands.
+#define MONKEY_ERROR(msg, index) { \
+    IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \
+                    __FILE__, __LINE__, msg, index, fRuleFileName, fTestData->fRandomSeed); \
+    if (fVerbose) { fTestData->dump(index); } \
+    status = U_INVALID_STATE_ERROR; \
+}
+// Thread body: per iteration, generate fresh test data and run all five checks; stop after more than 10 failing iterations.
+void RBBIMonkeyImpl::runTest() {
+    UErrorCode status = U_ZERO_ERROR;
+    int32_t errorCount = 0;
+    for (int64_t loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
+        status = U_ZERO_ERROR;
+        fTestData->set(fRuleSet.getAlias(), fRandomGenerator, status);
+        // fTestData->dump();
+        testForwards(status);
+        testPrevious(status);
+        testFollowing(status);
+        testPreceding(status);
+        testIsBoundary(status);
+
+        if (fLoopCount < 0 && loopCount % 100 == 0) {
+            fprintf(stderr, ".");
+        }
+        if (U_FAILURE(status)) {
+            if (++errorCount > 10) {
+                return;
+            }
+        }
+    }
+}
+// Walk first()/next(), record each boundary in fActualBreaks, then compare with the expected breaks.
+void RBBIMonkeyImpl::testForwards(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    fTestData->clearActualBreaks();
+    fBI->setText(fTestData->fString);
+    int32_t previousBreak = -2;
+    for (int32_t bk=fBI->first(); bk != BreakIterator::DONE; previousBreak=bk, bk=fBI->next()) {   // remember bk so a stall is detectable
+        if (bk <= previousBreak) {
+            MONKEY_ERROR("Break Iterator Stall", bk);
+            return;
+        }
+        if (bk < 0 || bk > fTestData->fString.length()) {
+            MONKEY_ERROR("Boundary out of bounds", bk);
+            return;
+        }
+        fTestData->fActualBreaks.setCharAt(bk, 1);
+    }
+    checkResults("testForwards", FORWARD, status);
+}
+// Call following(i) for every index from -1 up and check each result is the next boundary after i.
+void RBBIMonkeyImpl::testFollowing(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    fTestData->clearActualBreaks();
+    fBI->setText(fTestData->fString);
+    int32_t nextBreak = -1;
+    for (int32_t i=-1 ; i<fTestData->fString.length(); ++i) {
+        int32_t bk = fBI->following(i);
+        if (bk == BreakIterator::DONE && i == fTestData->fString.length()) {
+            continue;
+        }
+        if (bk == nextBreak && bk > i) {
+            // i is in the gap between two breaks.
+            continue;
+        }
+        if (i == nextBreak && bk > nextBreak) {
+            fTestData->fActualBreaks.setCharAt(bk, 1);
+            nextBreak = bk;
+            continue;
+        }
+        MONKEY_ERROR("following(i)", i);
+        return;
+    }
+    checkResults("testFollowing", FORWARD, status);
+}
+
+
+// Walk last()/previous(), record each boundary in fActualBreaks, then compare in reverse order.
+void RBBIMonkeyImpl::testPrevious(UErrorCode &status) {
+    if (U_FAILURE(status)) {return;}
+
+    fTestData->clearActualBreaks();
+    fBI->setText(fTestData->fString);
+    int32_t previousBreak = INT32_MAX;
+    for (int32_t bk=fBI->last(); bk != BreakIterator::DONE; previousBreak=bk, bk=fBI->previous()) {   // remember bk so a stall is detectable
+        if (bk >= previousBreak) {
+            MONKEY_ERROR("Break Iterator Stall", bk);
+            return;
+        }
+        if (bk < 0 || bk > fTestData->fString.length()) {
+            MONKEY_ERROR("Boundary out of bounds", bk);
+            return;
+        }
+        fTestData->fActualBreaks.setCharAt(bk, 1);
+    }
+    checkResults("testPrevious", REVERSE, status);
+}
+
+// Call preceding(i) for every index from length+1 down to 0 and check each result is the boundary before i.
+void RBBIMonkeyImpl::testPreceding(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    fTestData->clearActualBreaks();
+    fBI->setText(fTestData->fString);
+    int32_t nextBreak = fTestData->fString.length()+1;
+    for (int32_t i=fTestData->fString.length()+1 ; i>=0; --i) {
+        int32_t bk = fBI->preceding(i);
+        // printf("i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak);
+        if (bk == BreakIterator::DONE && i == 0) {
+            continue;
+        }
+        if (bk == nextBreak && bk < i) {
+            // i is in the gap between two breaks.
+            continue;
+        }
+        if (i<fTestData->fString.length() && fTestData->fString.getChar32Start(i) < i) {
+            // i indexes to a trailing surrogate.
+            // Break Iterators treat an index to either half as referring to the supplemental code point,
+            // with preceding going to some preceding code point.
+            if (fBI->preceding(i) != fBI->preceding(fTestData->fString.getChar32Start(i))) {
+                MONKEY_ERROR("preceding of trailing surrogate error", i);
+            }
+            continue;
+        }
+        if (i == nextBreak && bk < nextBreak) {
+            fTestData->fActualBreaks.setCharAt(bk, 1);
+            nextBreak = bk;
+            continue;
+        }
+        MONKEY_ERROR("preceding(i)", i);
+        return;
+    }
+    checkResults("testPreceding", REVERSE, status);
+}
+
+// Query isBoundary(i) at every index and compare the set of boundaries with the expected breaks.
+void RBBIMonkeyImpl::testIsBoundary(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    fTestData->clearActualBreaks();
+    fBI->setText(fTestData->fString);
+    for (int i=fTestData->fString.length(); i>=0; --i) {
+        if (fBI->isBoundary(i)) {
+            fTestData->fActualBreaks.setCharAt(i, 1);
+        }
+    }
+    checkResults("testIsBoundary", FORWARD, status);   // report under this test's own name
+}
+// Compare fActualBreaks with fExpectedBreaks, scanning in 'direction'; report only the first mismatch.
+void RBBIMonkeyImpl::checkResults(const char *msg, CheckDirection direction, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    if (direction == FORWARD) {
+        for (int i=0; i<=fTestData->fString.length(); ++i) {
+            if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
+                IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
+                        __FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed);
+                if (fVerbose) {
+                    fTestData->dump(i);
+                }
+                status = U_INVALID_STATE_ERROR;   // Prevent the test from continuing, which would likely
+                break;                            // produce many redundant errors.
+            }
+        }
+    } else {
+        for (int i=fTestData->fString.length(); i>=0; i--) {
+            if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
+                IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
+                        __FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed);
+                if (fVerbose) {
+                    fTestData->dump(i);
+                }
+                status = U_INVALID_STATE_ERROR;
+                break;
+            }
+        }
+    }
+}
+
+
+
+//---------------------------------------------------------------------------------------
+//
+//   class RBBIMonkeyTest implementation.
+//
+//---------------------------------------------------------------------------------------
+RBBIMonkeyTest::RBBIMonkeyTest() {
+}
+
+RBBIMonkeyTest::~RBBIMonkeyTest() {
+}
+
+
+//     params, taken from this->fParams.
+//       rules=file_name   Name of file containing the reference rules.
+//       seed=nnnnn        Random number starting seed.
+//                         Setting the seed allows errors to be reproduced.
+//       loop=nnn          Looping count. Controls running time.
+//                         -1: run forever.
+//                          0 or greater: run length.
+//       expansions        debug option, show expansions of rules and sets.
+//       verbose           Display details of the failure.
+//
+//     Parameters on the intltest command line follow the test name, and are preceded by '@'.
+//     For example,
+//           intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1
+//
+void RBBIMonkeyTest::testMonkey() {
+    // printf("Test parameters: %s\n", fParams);
+    UnicodeString params(fParams);
+    UErrorCode status = U_ZERO_ERROR;
+
+    const char *tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
+                           "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt",
+                           NULL };
+    CharString testNameFromParams;
+    if (getStringParam("rules", params, testNameFromParams, status)) {
+        tests[0] = testNameFromParams.data();
+        tests[1] = NULL;
+    }
+
+    int64_t loopCount = quick? 100 : 5000;
+    getIntParam("loop", params, loopCount, status);
+
+    UBool dumpExpansions = FALSE;
+    getBoolParam("expansions", params, dumpExpansions, status);
+
+    UBool verbose = FALSE;
+    getBoolParam("verbose", params, verbose, status);
+
+    int64_t seed = 0;
+    getIntParam("seed", params, seed, status);
+
+    if (params.length() != 0) {
+        // Options processing did not consume all of the parameters. Something unrecognized was present.
+        CharString unrecognizedParameters;
+        unrecognizedParameters.append(CStr(params)(), -1, status);
+        errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__, __LINE__, unrecognizedParameters.data());
+        return;
+    }
+
+    UVector startedTests(status);
+    if (U_FAILURE(status)) {
+        errln("%s:%d: error %s while setting up test.", __FILE__, __LINE__, u_errorName(status));
+        return;
+    }
+
+    // Monkey testing is multi-threaded.
+    // Each set of break rules to be tested is run in a separate thread.
+    // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
+    int32_t i;
+    for (i=0; tests[i] != NULL; ++i) {
+        logln("beginning testing of %s", tests[i]);
+        RBBIMonkeyImpl *test = new RBBIMonkeyImpl(status);
+        test->fDumpExpansions = dumpExpansions;
+        test->fVerbose = verbose;
+        test->fRandomGenerator.seed((uint32_t)seed);
+        test->fLoopCount = static_cast<int32_t>(loopCount);
+        test->setup(tests[i], status);
+        // A failed setup() returns before fTestData is created; runTest() would dereference it.
+        if (U_FAILURE(status)) { delete test; break; }
+        test->startTest();
+        startedTests.addElement(test, status);
+        if (U_FAILURE(status)) { break; }
+    }
+
+    if (U_FAILURE(status)) {
+        errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
+    }
+
+    for (i=0; i<startedTests.size(); ++i) {
+        RBBIMonkeyImpl *test = static_cast<RBBIMonkeyImpl *>(startedTests.elementAt(i));
+        test->join();
+        delete test;
+    }
+}
+
+// Find "name=<integer>" in params; on a match store it in val, delete it from params and return TRUE.
+UBool RBBIMonkeyTest::getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status) {
+    name.append(" *= *(-?\\d+) *,? *");
+    RegexMatcher m(name, params, 0, status);
+    if (m.find()) {
+        // The param exists. Convert the string to an int.
+        CharString str;
+        str.append(CStr(m.group(1, status))(), -1, status);
+        val = strtol(str.data(), NULL, 10);
+
+        // Delete this parameter from the params string.
+        m.reset();
+        params = m.replaceFirst(UnicodeString(), status);
+        return TRUE;
+    }
+    return FALSE;
+}
+// Find "name=<text>" in params; on a match append the text to dest, delete it from params and return TRUE.
+UBool RBBIMonkeyTest::getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status) {
+    name.append(" *= *([^ ,]*) *,? *");
+    RegexMatcher m(name, params, 0, status);
+    if (m.find()) {
+        // The param exists.
+        dest.append(CStr(m.group(1, status))(), -1, status);
+
+        // Delete this parameter from the params string.
+        m.reset();
+        params = m.replaceFirst(UnicodeString(), status);
+        return TRUE;
+    }
+    return FALSE;
+}
+// Find "name" or "name=true|false" (case-insensitive) in params; set dest, delete it from params and return TRUE.
+UBool RBBIMonkeyTest::getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status) {
+    name.append("(?: *= *(true|false))? *,? *");
+    RegexMatcher m(name, params, UREGEX_CASE_INSENSITIVE, status);
+    if (m.find()) {
+        if (m.start(1, status) > 0) {
+            // user option included a value.
+            dest = m.group(1, status).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT) == 0;
+        } else {
+            // No explicit user value, implies true.
+            dest = TRUE;
+        }
+
+        // Delete this parameter from the params string.
+        m.reset();
+        params = m.replaceFirst(UnicodeString(), status);
+        return TRUE;
+    }
+    return FALSE;
+}
+
diff --git a/icu4c/source/test/intltest/rbbimonkeytest.h b/icu4c/source/test/intltest/rbbimonkeytest.h
new file mode 100644
index 00000000000..484ce05ad6e
--- /dev/null
+++ b/icu4c/source/test/intltest/rbbimonkeytest.h
@@ -0,0 +1,208 @@
+/*************************************************************************
+ * Copyright (c) 2016, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *************************************************************************
+*/
+#ifndef RBBIMONKEYTEST_H
+#define RBBIMONKEYTEST_H
+
+#include "unicode/utypes.h"
+
+#include "intltest.h"
+
+#include "unicode/rbbi.h"
+#include "unicode/regex.h"
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
+#include "unicode/uobject.h"
+
+#include "simplethread.h"
+#include "ucbuf.h"
+#include "uhash.h"
+#include "uvector.h"
+
+//
+//  TODO:
+//     Develop a tailoring format.
+//     Hook to old tests that use monkey impl to get expected data.
+//     Remove old tests.
+
+class BreakRules;         // Forward declaration
+class RBBIMonkeyImpl;
+
+/**
+ * Test the RuleBasedBreakIterator class giving different rules
+ */
+class RBBIMonkeyTest: public IntlTest {
+  public:
+    RBBIMonkeyTest();
+    virtual ~RBBIMonkeyTest();
+
+    void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
+    void testMonkey();
+
+
+  private:
+    const char *fParams;                  // Copy of user parameters passed in from IntlTest.
+
+    // Each get*Param() looks for one named option in params and removes it from params when found.
+    void testRules(const char *ruleFile);
+    static UBool getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status);
+    static UBool getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status);
+    static UBool getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status);
+
+};
+
+// The following classes are internal to the RBBI Monkey Test implementation.
+
+
+
+//  class CharClass    Represents a single character class from the source break rules.
+//                     Inherits from UObject because instances are adopted by UHashtable, which ultimately
+//                     deletes them using hash's object deleter function.
+
+class CharClass: public UObject {
+  public:
+    UnicodeString                fName;
+    UnicodeString                fOriginalDef;    // set definition as it appeared in user supplied rules.
+    UnicodeString                fExpandedDef;    // set definition with any embedded named sets replaced by their defs, recursively.
+    LocalPointer<const UnicodeSet>  fSet;         // Owns the set passed to the constructor.
+    CharClass(const UnicodeString &name, const UnicodeString &originalDef, const UnicodeString &expandedDef, const UnicodeSet *set) :
+            fName(name), fOriginalDef(originalDef), fExpandedDef(expandedDef), fSet(set) {}
+};
+
+
+// class BreakRule    represents a single rule from a set of break rules.
+//                    Each rule has the set definitions expanded, and
+//                    is compiled to a regular expression.
+
+class BreakRule: public UObject {
+  public:
+    BreakRule();
+    ~BreakRule();
+    UnicodeString    fName;                            // Name of the rule.
+    UnicodeString    fRule;                            // Rule expression, excluding the name, as written in user source.
+    UnicodeString    fExpandedRule;                    // Rule expression after expanding the set definitions.
+    LocalPointer<RegexMatcher>  fRuleMatcher;          // Regular expression that matches the rule.
+};
+
+
+// class BreakRules    represents a complete set of break rules, possibly tailored,
+//                     compiled from testdata break rules.
+
+class BreakRules: public UObject {
+  public:
+    BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status);
+    ~BreakRules();
+
+    void compileRules(UCHARBUF *rules, UErrorCode &status);
+
+    const CharClass *getClassForChar(UChar32 c, int32_t *iter=NULL) const;
+
+
+    RBBIMonkeyImpl    *fMonkeyImpl;        // Pointer back to the owning MonkeyImpl instance.
+    icu::UVector       fBreakRules;        // Contents are of type (BreakRule *).
+
+    LocalUHashtablePointer fCharClasses;   // Key is set name (UnicodeString).
+                                           // Value is (CharClass *)
+    LocalPointer<UVector>  fCharClassList; // Char Classes, same contents as fCharClasses values,
+                                           //   but in a vector so they can be accessed by index.
+    UnicodeSet         fDictionarySet;     // Dictionary set, empty if none is defined.
+    Locale             fLocale;
+    UBreakIteratorType fType;
+
+    CharClass *addCharClass(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
+    void addRule(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
+    bool setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status);
+    RuleBasedBreakIterator *createICUBreakIterator(UErrorCode &status);
+    // NOTE(review): element type of the four matchers below is inferred from their names; confirm.
+    LocalPointer<RegexMatcher> fSetRefsMatcher;
+    LocalPointer<RegexMatcher> fCommentsMatcher;
+    LocalPointer<RegexMatcher> fClassDefMatcher;
+    LocalPointer<RegexMatcher> fRuleDefMatcher;
+};
+
+
+// class MonkeyTestData    represents a randomly synthesized test data string together
+//                         with the expected break positions obtained by applying
+//                         the test break rules.
+
+class MonkeyTestData: public UObject {
+  public:
+    MonkeyTestData() {};
+    ~MonkeyTestData() {};
+    void set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status);
+    void clearActualBreaks();
+    void dump(int32_t around = -1) const;
+
+    uint32_t               fRandomSeed;        // The initial seed value from the random number generator.
+    const BreakRules      *fBkRules;           // The break rules used to generate this data.
+    UnicodeString          fString;            // The text.
+    UnicodeString          fExpectedBreaks;    // Breaks as found by the reference rules.
+                                               //     Parallel to fString. Non-zero if break preceding.
+    UnicodeString          fActualBreaks;      // Breaks as found by ICU break iterator.
+    UnicodeString          fRuleForPosition;   // Index into BreakRules.fBreakRules of rule that applied at each position.
+                                               // Also parallel to fString.
+    UnicodeString          f2ndRuleForPos;     // As above. A 2nd rule applies when the preceding rule
+                                               //   didn't cause a break, and a subsequent rule match starts
+                                               //   on the last code point of the preceding match.
+
+};
+
+
+
+
+// class RBBIMonkeyImpl     holds (some indirectly) everything associated with running a monkey
+//                          test for one set of break rules.
+//
+//                          When running RBBIMonkeyTest with multiple threads, there is a 1:1 correspondence
+//                          between instances of RBBIMonkeyImpl and threads.
+//
+class RBBIMonkeyImpl: public UObject {
+  public:
+    RBBIMonkeyImpl(UErrorCode &status);
+    ~RBBIMonkeyImpl();
+
+    void setup(const char *ruleFileName, UErrorCode &status);
+
+    void startTest();
+    void runTest();
+    void join();
+
+    LocalUCHARBUFPointer                 fRuleCharBuffer;         // source file contents of the reference rules.
+    LocalPointer<BreakRules>             fRuleSet;                // Reference rules compiled by setup().
+    LocalPointer<RuleBasedBreakIterator> fBI;                     // ICU break iterator under test, created by setup().
+    LocalPointer<MonkeyTestData>         fTestData;               // Created by setup(); refilled on every runTest() iteration.
+    IntlTest::icu_rand                   fRandomGenerator;
+    const char                          *fRuleFileName;
+    UBool                                fVerbose;                 // True to do long dump of failing data.
+    int32_t                              fLoopCount;               // Negative means runTest() loops forever.
+
+    UBool                                fDumpExpansions;          // Debug flag to output expanded form of rules and sets.
+
+    enum CheckDirection {
+        FORWARD = 1,
+        REVERSE = 2
+    };
+    void clearActualBreaks();
+    void testForwards(UErrorCode &status);
+    void testPrevious(UErrorCode &status);
+    void testFollowing(UErrorCode &status);
+    void testPreceding(UErrorCode &status);
+    void testIsBoundary(UErrorCode &status);
+    void checkResults(const char *msg, CheckDirection dir, UErrorCode &status);
+
+    class RBBIMonkeyThread: public SimpleThread {
+      private:
+        RBBIMonkeyImpl *fMonkeyImpl;
+      public:
+        RBBIMonkeyThread(RBBIMonkeyImpl *impl) : fMonkeyImpl(impl) {};
+        void run() U_OVERRIDE { fMonkeyImpl->runTest(); };
+    };
+  private:
+    void openBreakRules(const char *fileName, UErrorCode &status);
+    RBBIMonkeyThread fThread;
+
+};
+
+#endif  //  RBBIMONKEYTEST_H
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 4e436deba9f..a99b3b70a43 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -9,36 +9,36 @@ * 01/12/2000 Madhu Updated for changed API and added new tests ************************************************************************/ -#include "utypeinfo.h" // for 'typeid' to work - #include "unicode/utypes.h" - #if !UCONFIG_NO_BREAK_ITERATION 
-#include "unicode/utypes.h" +#include +#include +#include + #include "unicode/brkiter.h" +#include "unicode/localpointer.h" +#include "unicode/numfmt.h" #include "unicode/rbbi.h" +#if !UCONFIG_NO_REGULAR_EXPRESSIONS +#include "unicode/regex.h" +#endif +#include "unicode/schriter.h" #include "unicode/uchar.h" #include "unicode/utf16.h" #include "unicode/ucnv.h" -#include "unicode/schriter.h" #include "unicode/uniset.h" -#if !UCONFIG_NO_REGULAR_EXPRESSIONS -#include "unicode/regex.h" -#endif +#include "unicode/uscript.h" #include "unicode/ustring.h" #include "unicode/utext.h" + +#include "charstr.h" +#include "cmemory.h" #include "intltest.h" #include "rbbitst.h" -#include -#include "charstr.h" +#include "utypeinfo.h" // for 'typeid' to work #include "uvector.h" #include "uvectr32.h" -#include -#include -#include "unicode/numfmt.h" -#include "unicode/uscript.h" -#include "cmemory.h" #if !UCONFIG_NO_FILTERED_BREAK_ITERATION #include "unicode/filteredbrk.h" @@ -56,7 +56,7 @@ //--------------------------------------------- -// Note: Before adding new tests to this file, check whether the desired test data can +// Note: Before adding new tests to this file, check whether the desired test data can // simply be added to the file testdata/rbbitest.txt. 
In most cases it can, // it's much less work than writing a new test, diagnostic output in the event of failures // is good, and the test data file will is shared with ICU4J, so eventually the test @@ -79,7 +79,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha break; case 2: name = "TestStatusReturn"; if(exec) TestStatusReturn(); break; - + #if !UCONFIG_NO_FILE_IO case 3: name = "TestUnicodeFiles"; if(exec) TestUnicodeFiles(); break; @@ -117,7 +117,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha #endif #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO - case 16: + case 16: name = "TestMonkey"; if(exec) TestMonkey(params); break; #else case 16: @@ -323,7 +323,7 @@ void RBBITest::TestStatusReturn() { "$Numbers = [:N:];\n" "$Letters+{1};\n" "$Numbers+{2};\n" - "Help\\ {4}/me\\!;\n" + "Help\\ /me\\!{4};\n" "[^$Letters $Numbers];\n" "!.*;\n", -1, US_INV); UnicodeString testString1 = "abc123..abc Help me Help me!"; @@ -334,28 +334,27 @@ void RBBITest::TestStatusReturn() { UErrorCode status=U_ZERO_ERROR; UParseError parseError; - BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); + LocalPointer bi(new RuleBasedBreakIterator(rulesString1, parseError, status)); if(U_FAILURE(status)) { - dataerrln("FAIL : in construction - %s", u_errorName(status)); - } else { - int32_t pos; - int32_t i = 0; - bi->setText(testString1); - for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) { - if (pos != bounds1[i]) { - errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos); - break; - } + dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__, u_errorName(status)); + return; + } + int32_t pos; + int32_t i = 0; + bi->setText(testString1); + for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) { + if (pos != bounds1[i]) { + errln("%s:%d expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos); + break; + } - int 
tag = bi->getRuleStatus(); - if (tag != brkStatus[i]) { - errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag); - break; - } - i++; + int tag = bi->getRuleStatus(); + if (tag != brkStatus[i]) { + errln("%s:%d break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag); + break; } + i++; } - delete bi; } @@ -817,7 +816,7 @@ void RBBITest::TestBug5775() { if (bi == NULL) { return; } - + UnicodeString s("One.\\u00ad Two.", -1, US_INV); // 01234 56789 s = s.unescape(); @@ -869,7 +868,7 @@ struct TestParams { utext_close(textToBreak); delete textMap; } - + int32_t getSrcLine(int32_t bp); int32_t getExpectedBreak(int32_t bp); int32_t getSrcCol(int32_t bp); @@ -901,7 +900,7 @@ static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorC 0xfffd, NULL, &status); dest.append(buffer, utf8Length, status); } - + void TestParams::setUTF16(UErrorCode &status) { textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status); @@ -1578,7 +1577,7 @@ void RBBITest::TestDictRules() { //------------------------------------------------------------------------------- // // ReadAndConvertFile Read a text data file, convert it to UChars, and -// return the datain one big UChar * buffer, which the caller must delete. +// return the data in one big UChar * buffer, which the caller must delete. // // parameters: // fileName: the name of the file, with no directory part. The test data directory @@ -1780,7 +1779,7 @@ void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator * } strcpy(testFileName, testDataDirectory); strcat(testFileName, fileName); - + logln("Opening data file %s\n", fileName); int len; @@ -1858,7 +1857,7 @@ void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator * else if (tokenMatcher.start(4, status) >= 0) { // Scanned to end of a line, possibly skipping over a comment in the process. 
// If the line from the file contained test data, run the test now. - if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) { + if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) { checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi); } @@ -2030,6 +2029,10 @@ private: UnicodeSet *fLVTSet; UnicodeSet *fHangulSet; UnicodeSet *fAnySet; + UnicodeSet *fEmojiModifierSet; + UnicodeSet *fEmojiBaseSet; + UnicodeSet *fZWJSet; + UnicodeSet *fGAZSet; const UnicodeString *fText; }; @@ -2041,8 +2044,8 @@ RBBICharMonkey::RBBICharMonkey() { fText = NULL; fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status); - fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status); - fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status); + fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]]"), status); + fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]]"), status); fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status); fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status); fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status); @@ -2059,6 +2062,18 @@ RBBICharMonkey::RBBICharMonkey() { fHangulSet->addAll(*fLVTSet); fAnySet = new UnicodeSet(0, 0x10ffff); + + + fEmojiBaseSet = new UnicodeSet(UnicodeString( + "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443" + "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483" + "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647" + 
"\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status); + + fEmojiModifierSet = new UnicodeSet(0x0001F3FB, 0x0001F3FF); + fZWJSet = new UnicodeSet(0x200D, 0x200D); + fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8\\u2764]"), status); + fSets = new UVector(status); fSets->addElement(fCRLFSet, status); fSets->addElement(fControlSet, status); @@ -2070,6 +2085,10 @@ RBBICharMonkey::RBBICharMonkey() { fSets->addElement(fSpacingSet, status); fSets->addElement(fHangulSet, status); fSets->addElement(fAnySet, status); + fSets->addElement(fEmojiBaseSet, status); + fSets->addElement(fEmojiModifierSet, status); + fSets->addElement(fZWJSet, status); + fSets->addElement(fGAZSet, status); if (U_FAILURE(status)) { deferredStatus = status; } @@ -2090,7 +2109,7 @@ int32_t RBBICharMonkey::next(int32_t prevPos) { int breakPos = -1; UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. - + if (U_FAILURE(deferredStatus)) { return -1; } @@ -2171,12 +2190,20 @@ int32_t RBBICharMonkey::next(int32_t prevPos) { } // Rule (GB8a) Regional_Indicator x Regional_Indicator + // Note: The first if condition is a little tricky. We only need to force + // a break if there are three or more contiguous RIs. If there are + // only two, a break following will occur via other rules, and will include + // any trailing extend characters, which is needed behavior. 
+ if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1) + && fRegionalIndicatorSet->contains(c2)) { + break; + } if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { continue; } - // Rule (GB9) Numeric x ALetter - if (fExtendSet->contains(c2)) { + // Rule (GB9) x Extend + if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) { continue; } @@ -2190,6 +2217,16 @@ int32_t RBBICharMonkey::next(int32_t prevPos) { continue; } + // Rule (GB9c) Emoji_Base x Emoji_Modifier + if ((fEmojiBaseSet->contains(c1) || fGAZSet->contains(c1)) && fEmojiModifierSet->contains(c2)) { + continue; + } + + // Rule (GB9d) ZWJ x Glue_After_Zwj + if (fZWJSet->contains(c1) && fGAZSet->contains(c2)) { + continue; + } + // Rule (GB10) Any Any break; } @@ -2220,6 +2257,10 @@ RBBICharMonkey::~RBBICharMonkey() { delete fLVTSet; delete fHangulSet; delete fAnySet; + delete fEmojiBaseSet; + delete fEmojiModifierSet; + delete fZWJSet; + delete fGAZSet; } //------------------------------------------------------------------------------------------ @@ -2245,7 +2286,7 @@ private: UnicodeSet *fKatakanaSet; UnicodeSet *fHebrew_LetterSet; UnicodeSet *fALetterSet; - // TODO(jungshik): Do we still need this change? + // TODO(jungshik): Do we still need this change? // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt UnicodeSet *fSingle_QuoteSet; UnicodeSet *fDouble_QuoteSet; @@ -2258,6 +2299,10 @@ private: UnicodeSet *fExtendSet; UnicodeSet *fExtendNumLetSet; UnicodeSet *fDictionaryCjkSet; + UnicodeSet *fEBaseSet; + UnicodeSet *fEModifierSet; + UnicodeSet *fZWSSet; + UnicodeSet *fGAZSet; const UnicodeString *fText; }; @@ -2275,7 +2320,7 @@ RBBIWordMonkey::RBBIWordMonkey() fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status); // Exclude Hangul syllables from ALetterSet during testing. // Leave CJK dictionary characters out from the monkey tests! 
-#if 0 +#if 0 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}" "[\\p{Line_Break = Complex_Context}" "-\\p{Grapheme_Cluster_Break = Extend}" @@ -2300,6 +2345,18 @@ RBBIWordMonkey::RBBIWordMonkey() fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status); fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status); + fEBaseSet = new UnicodeSet(UnicodeString( + "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443" + "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483" + "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647" + "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status); + + fEModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status); + fZWSSet = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);; + fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8\\u2764]"), status); + fExtendSet->removeAll(*fZWSSet); + + fOtherSet = new UnicodeSet(); if(U_FAILURE(status)) { deferredStatus = status; @@ -2322,6 +2379,11 @@ RBBIWordMonkey::RBBIWordMonkey() fOtherSet->removeAll(*fFormatSet); fOtherSet->removeAll(*fExtendSet); fOtherSet->removeAll(*fRegionalIndicatorSet); + fOtherSet->removeAll(*fEBaseSet); + fOtherSet->removeAll(*fEModifierSet); + fOtherSet->removeAll(*fZWSSet); + fOtherSet->removeAll(*fGAZSet); + // Inhibit dictionary characters from being tested at all. 
fOtherSet->removeAll(*fDictionaryCjkSet); fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status)); @@ -2344,6 +2406,11 @@ RBBIWordMonkey::RBBIWordMonkey() fSets->addElement(fOtherSet, status); fSets->addElement(fExtendNumLetSet, status); + fSets->addElement(fEBaseSet, status); + fSets->addElement(fEModifierSet, status); + fSets->addElement(fZWSSet, status); + fSets->addElement(fGAZSet, status); + if (U_FAILURE(status)) { deferredStatus = status; } @@ -2362,7 +2429,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) { int breakPos = -1; UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. - + if (U_FAILURE(deferredStatus)) { return -1; } @@ -2392,7 +2459,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) { break; }; } - while (fFormatSet->contains(c3) || fExtendSet->contains(c3)); + while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWSSet->contains(c3)); if (p1 == p2) { @@ -2411,7 +2478,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) { if (c1==0x0D && c2==0x0A) { continue; } - + // Rule (3a) Break before and after newlines (including CR and LF) // if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) { @@ -2421,6 +2488,15 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) { break; }; + // Rule (3c) ZWJ x GAZ (Glue after ZWJ). + // Not ignoring extend chars, so peek into input text to + // get the potential ZWJ, the character immediately preceding c2. + // Sloppy UChar32 indexing: p2-1 may reference trail half + // but char32At will get the full code point. + if (fZWSSet->contains(fText->char32At(p2-1)) && fGAZSet->contains(c2)) { + continue; + } + // Rule (5). 
(ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter) if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { @@ -2510,10 +2586,18 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) { } // Rule 13c + if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) { + break; + } if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { continue; } + // Rule 13d + if ((fEBaseSet->contains(c1) || fGAZSet->contains(c1)) && fEModifierSet->contains(c2)) { + continue; + } + // Rule 14. Break found here. break; } @@ -2548,6 +2632,10 @@ RBBIWordMonkey::~RBBIWordMonkey() { delete fRegionalIndicatorSet; delete fDictionaryCjkSet; delete fOtherSet; + delete fEBaseSet; + delete fEModifierSet; + delete fZWSSet; + delete fGAZSet; } @@ -2933,17 +3021,29 @@ private: UnicodeSet *fHL; UnicodeSet *fID; UnicodeSet *fRI; - UnicodeSet *fSA; UnicodeSet *fXX; + UnicodeSet *fEB; + UnicodeSet *fEM; + UnicodeSet *fZJ; BreakIterator *fCharBI; const UnicodeString *fText; RegexMatcher *fNumberMatcher; }; +RBBILineMonkey::RBBILineMonkey() : + RBBIMonkeyKind(), + fSets(NULL), + + fCharBI(NULL), + fText(NULL), + fNumberMatcher(NULL) -RBBILineMonkey::RBBILineMonkey() { + if (U_FAILURE(deferredStatus)) { + return; + } + UErrorCode status = U_ZERO_ERROR; fSets = new UVector(status); @@ -2985,24 +3085,35 @@ RBBILineMonkey::RBBILineMonkey() fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status); fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status); fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status); - fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status); fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status); fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); + fEB = new UnicodeSet(UnicodeString( + 
"[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443" + "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483" + "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647" + "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status); + fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status); + fZJ = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D); if (U_FAILURE(status)) { deferredStatus = status; - fCharBI = NULL; - fNumberMatcher = NULL; return; } fAL->addAll(*fXX); // Default behavior for XX is identical to AL fAL->addAll(*fAI); // Default behavior for AI is identical to AL - fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL fAL->addAll(*fSG); // Default behavior for SG is identical to AL. fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS. + fID->addAll(*fEB); // Emoji Base and Emoji Modifier behave as ID. + fID->addAll(*fEM); + fAL->removeAll(*fEM); + + + fAL->remove((UChar32)0x2764); // Emoji Proposal: move u2764 from Al to Id + fID->add((UChar32)0x2764); + fSets->addElement(fBK, status); fSets->addElement(fCR, status); fSets->addElement(fLF, status); @@ -3040,10 +3151,12 @@ RBBILineMonkey::RBBILineMonkey() fSets->addElement(fID, status); fSets->addElement(fWJ, status); fSets->addElement(fRI, status); - fSets->addElement(fSA, status); fSets->addElement(fSG, status); + fSets->addElement(fEB, status); + fSets->addElement(fEM, status); + fSets->addElement(fZJ, status); - const char *rules = + const char *rules = "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?" "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?" 
"\\p{Line_Break=NU}\\p{Line_Break=CM}*" @@ -3228,6 +3341,18 @@ int32_t RBBILineMonkey::next(int32_t startPos) { break; } + // LB 8a ZJ x ID + // The monkey test's way of ignoring combining characters doesn't work + // for this rule. ZJ is also a CM. Need to get the actual character + // preceding "thisChar", not ignoring combining marks, possibly ZJ. + { + int32_t prevIdx = fText->moveIndex32(pos, -1); + UChar32 prevC = fText->char32At(prevIdx); + if (fZJ->contains(prevC) && fID->contains(thisChar)) { + continue; + } + } + // LB 9, 10 Already done, at top of loop. // @@ -3245,7 +3370,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) { if (fGL->contains(prevChar)) { continue; } - + // LB 12a // [^SP BA HY] x GL if (!(fSP->contains(prevChar) || @@ -3368,7 +3493,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) { // LB 21a // HL (HY | BA) x - if (fHL->contains(prevCharX2) && + if (fHL->contains(prevCharX2) && (fHY->contains(prevChar) || fBA->contains(prevChar))) { continue; } @@ -3495,12 +3620,20 @@ int32_t RBBILineMonkey::next(int32_t startPos) { continue; } - // LB30a Do not break between regional indicators. 
- // RI x RI + // LB30a RI RI RI + // RI x RI + if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) { + break; + } if (fRI->contains(prevChar) && fRI->contains(thisChar)) { continue; } + // LB30b Emoji Base x Emoji Modifier + if (fEB->contains(prevChar) && fEM->contains(thisChar)) { + continue; + } + // LB 31 Break everywhere else break; @@ -3555,9 +3688,10 @@ RBBILineMonkey::~RBBILineMonkey() { delete fHL; delete fID; delete fRI; - delete fSA; delete fSG; - delete fXX; + delete fEB; + delete fEM; + delete fZJ; delete fCharBI; delete fNumberMatcher; @@ -3577,6 +3711,9 @@ RBBILineMonkey::~RBBILineMonkey() { // // type = char | word | line | sent | title // +// Example: +// intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1" +// //------------------------------------------------------------------------------------------- static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) { @@ -3853,7 +3990,6 @@ void RBBITest::TestLineBreaks(void) "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", - "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5", "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", @@ -3869,25 +4005,19 @@ void RBBITest::TestLineBreaks(void) "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763", "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
"\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7", - "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc", "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a", "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945", "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014", "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", - "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025", "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", - "\\u2014\\u0020\\u000a\\u17c5\\u24fc", "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f", "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010", "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43", "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb", "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc", "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060", - "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d" - "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5" - "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b", "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0", "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07", }; @@ -4175,9 +4305,15 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name int32_t charIdx = m_rand() % classSet->size(); UChar32 c = classSet->charAt(charIdx); if (c < 0) { // TODO: deal with sets containing strings. - errln("c < 0"); + errln("%s:%d c < 0", __FILE__, __LINE__); break; } + // Do not assemble a supplementary character from randomly generated separate surrogates. 
+ // (It could be a dictionary character) + if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) { + continue; + } + testText.append(c); } @@ -4284,7 +4420,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name } else { if (breakPos >= 0) { precedingBreaks[breakPos] = 1; - } + } lastBreakPos = breakPos; } } @@ -4379,7 +4515,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status); charErrorTxt[sizeof(charErrorTxt)-1] = 0; const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status); - + errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s", name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"), errorType, seed, i, charErrorTxt); @@ -4402,15 +4538,15 @@ void RBBITest::TestBug5532(void) { // Text includes a mixture of Thai and Latin. 
const unsigned char utf8Data[] = { 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u, - 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u, + 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u, 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u, - 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u, - 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu, - 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u, - 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, - 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u, + 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u, + 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu, + 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u, + 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, + 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u, 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00}; UErrorCode status = U_ZERO_ERROR; @@ -4467,7 +4603,7 @@ void RBBITest::TestBug9983(void) { rstatus = brkiter->getRuleStatus(); (void)rstatus; // Suppress set but not used warning. if (iterationCount >= 10) { - break; + break; } } TEST_ASSERT(iterationCount == 6); @@ -4480,7 +4616,7 @@ void RBBITest::TestBug9983(void) { rstatus = brkiterPOSIX->getRuleStatus(); (void)rstatus; // Suppress set but not used warning. 
if (iterationCount >= 10) { - break; + break; } } TEST_ASSERT(iterationCount == 6); diff --git a/icu4c/source/test/testdata/GraphemeBreakTest.txt b/icu4c/source/test/testdata/GraphemeBreakTest.txt index 61eaf57b7c4..0250ac9442f 100644 --- a/icu4c/source/test/testdata/GraphemeBreakTest.txt +++ b/icu4c/source/test/testdata/GraphemeBreakTest.txt @@ -1,5 +1,6 @@ # GraphemeBreakTest-8.0.0.txt # Date: 2015-02-13, 13:47:15 GMT [MD] +# Hand patched for Emoji breaking proposal L2/16-011R3. # # Unicode Character Database # Copyright (c) 1991-2015 Unicode, Inc. @@ -9,9 +10,9 @@ # Default Grapheme Break Test # # Format: -# (# )? -# contains hex Unicode code points, with -# ÷ wherever there is a break opportunity, and +# (# )? +# contains hex Unicode code points, with +# ÷ wherever there is a break opportunity, and # × wherever there is not. # the format can change, but currently it shows: # - the sample character name @@ -414,10 +415,10 @@ ÷ D800 ÷ 0308 ÷ D800 ÷ # ÷ [0.2] (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (Control) ÷ [0.3] ÷ 0061 ÷ 1F1E6 ÷ 0062 ÷ # ÷ [0.2] LATIN SMALL LETTER A (Other) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) ÷ [999.0] LATIN SMALL LETTER B (Other) ÷ [0.3] ÷ 1F1F7 × 1F1FA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) ÷ [0.3] -÷ 1F1F7 × 1F1FA × 1F1F8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) ÷ [0.3] -÷ 1F1F7 × 1F1FA × 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3] +÷ 1F1F7 × 1F1FA ÷ 1F1F8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R 
(Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) ÷ [0.3] +÷ 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3] ÷ 1F1F7 × 1F1FA ÷ 200B ÷ 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) ÷ [5.0] ZERO WIDTH SPACE (Control) ÷ [4.0] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3] -÷ 1F1E6 × 1F1E7 × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3] +÷ 1F1E6 × 1F1E7 ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3] ÷ 1F1E6 × 200D ÷ 1F1E7 × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [9.0] ZERO WIDTH JOINER (Extend) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3] ÷ 1F1E6 × 1F1E7 × 200D ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [9.0] ZERO WIDTH JOINER (Extend) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3] ÷ 0020 × 200D ÷ 0646 ÷ # ÷ [0.2] SPACE (Other) × [9.0] ZERO WIDTH JOINER (Extend) ÷ [999.0] ARABIC LETTER NOON (Other) ÷ [0.3] diff --git a/icu4c/source/test/testdata/LineBreakTest.txt 
b/icu4c/source/test/testdata/LineBreakTest.txt index 19cedf547a6..0bdf5af208b 100644 --- a/icu4c/source/test/testdata/LineBreakTest.txt +++ b/icu4c/source/test/testdata/LineBreakTest.txt @@ -1,5 +1,6 @@ # LineBreakTest-8.0.0.txt # Date: 2015-04-30, 09:40:15 GMT [MD] +# Hand patched for Emoji break proposal L2/16-011R3 # # Unicode Character Database # Copyright (c) 1991-2015 Unicode, Inc. @@ -6324,13 +6325,14 @@ × 3057 × 3001 ÷ 0061 × 0062 ÷ 3068 ÷ # × [0.3] HIRAGANA LETTER SI (ID) × [13.02] IDEOGRAPHIC COMMA (CL) ÷ [999.0] LATIN SMALL LETTER A (AL) × [28.0] LATIN SMALL LETTER B (AL) ÷ [999.0] HIRAGANA LETTER TO (ID) ÷ [0.3] × 0061 ÷ 1F1E6 ÷ 0062 ÷ # × [0.3] LATIN SMALL LETTER A (AL) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER A (RI) ÷ [999.0] LATIN SMALL LETTER B (AL) ÷ [0.3] × 1F1F7 × 1F1FA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) ÷ [0.3] -× 1F1F7 × 1F1FA × 1F1F8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER S (RI) ÷ [0.3] -× 1F1F7 × 1F1FA × 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3] +× 1F1F7 × 1F1FA ÷ 1F1F8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER S (RI) ÷ [0.3] +× 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3] × 1F1F7 × 1F1FA × 200B ÷ 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [7.02] ZERO WIDTH SPACE (ZW) ÷ [8.0] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] 
REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3] × 05D0 × 002D × 05D0 ÷ # × [0.3] HEBREW LETTER ALEF (HL) × [21.02] HYPHEN-MINUS (HY) × [21.1] HEBREW LETTER ALEF (HL) ÷ [0.3] -× 1F1E6 × 1F1E7 × 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3] -× 1F1E6 × 200D × 1F1E7 × 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [9.0] ZERO WIDTH JOINER (CM) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3] -× 1F1E6 × 1F1E7 × 200D × 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [9.0] ZERO WIDTH JOINER (CM) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3] +× 1F1E6 × 1F1E7 ÷ 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3] +# Patched the following two lines for RI pairing. Note ZWJ behaves as CM and logically disappears. 
+× 1F1E6 × 200D × 1F1E7 ÷ 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [9.0] ZERO WIDTH JOINER (CM) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3] +× 1F1E6 × 1F1E7 × 200D ÷ 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [9.0] ZERO WIDTH JOINER (CM) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3] × 0020 ÷ 200D × 0646 ÷ # × [0.3] SPACE (SP) ÷ [18.0] ZERO WIDTH JOINER (CM) × [28.0] ARABIC LETTER NOON (AL) ÷ [0.3] × 0646 × 200D × 0020 ÷ # × [0.3] ARABIC LETTER NOON (AL) × [9.0] ZERO WIDTH JOINER (CM) × [7.01] SPACE (SP) ÷ [0.3] # diff --git a/icu4c/source/test/testdata/WordBreakTest.txt b/icu4c/source/test/testdata/WordBreakTest.txt index 72318b06734..22b6b8e758b 100644 --- a/icu4c/source/test/testdata/WordBreakTest.txt +++ b/icu4c/source/test/testdata/WordBreakTest.txt @@ -1,5 +1,7 @@ # WordBreakTest-8.0.0.txt # Date: 2015-05-02, 14:48:55 GMT [MD] + +# Hand Patched for Emoji breaking proposal L2/16-011R3 # # Unicode Character Database # Copyright (c) 1991-2015 Unicode, Inc. 
@@ -1392,13 +1394,13 @@ ÷ 2060 ÷ 0043 × 2060 × 002E × 2060 × 0044 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) ÷ [999.0] LATIN CAPITAL LETTER C (ALetter) × [4.0] WORD JOINER (Format_FE) × [6.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [7.0] LATIN CAPITAL LETTER D (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] WORD JOINER (Format_FE) ÷ [0.3] ÷ 0061 ÷ 1F1E6 ÷ 0062 ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) ÷ [999.0] LATIN SMALL LETTER B (ALetter) ÷ [0.3] ÷ 1F1F7 × 1F1FA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) ÷ [0.3] -÷ 1F1F7 × 1F1FA × 1F1F8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) ÷ [0.3] -÷ 1F1F7 × 1F1FA × 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3] +÷ 1F1F7 × 1F1FA ÷ 1F1F8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) ÷ [0.3] +÷ 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3] ÷ 1F1F7 × 1F1FA ÷ 200B ÷ 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) ÷ [999.0] ZERO WIDTH SPACE (Other) ÷ [999.0] REGIONAL 
INDICATOR SYMBOL LETTER S (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3] ÷ 05D0 × 0022 × 05D0 ÷ # ÷ [0.2] HEBREW LETTER ALEF (Hebrew_Letter) × [7.2] QUOTATION MARK (Double_Quote) × [7.3] HEBREW LETTER ALEF (Hebrew_Letter) ÷ [0.3] -÷ 1F1E6 × 1F1E7 × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3] -÷ 1F1E6 × 200D × 1F1E7 × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [4.0] ZERO WIDTH JOINER (Extend_FE) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3] -÷ 1F1E6 × 1F1E7 × 200D × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [4.0] ZERO WIDTH JOINER (Extend_FE) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3] +÷ 1F1E6 × 1F1E7 ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3] +÷ 1F1E6 × 200D × 1F1E7 ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [4.0] ZERO WIDTH JOINER (Extend_FE) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3] +÷ 1F1E6 × 1F1E7 × 200D ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [4.0] ZERO WIDTH JOINER (Extend_FE) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3] ÷ 0020 × 200D ÷ 0646 ÷ # ÷ [0.2] SPACE (Other) × [4.0] ZERO WIDTH JOINER (Extend_FE) ÷ [999.0] ARABIC LETTER NOON (ALetter) ÷ [0.3] ÷ 0646 × 200D ÷ 0020 ÷ # 
÷ [0.2] ARABIC LETTER NOON (ALetter) × [4.0] ZERO WIDTH JOINER (Extend_FE) ÷ [999.0] SPACE (Other) ÷ [0.3] ÷ 0031 ÷ 003A ÷ 003A ÷ 0031 ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) ÷ [999.0] COLON (MidLetter) ÷ [999.0] DIGIT ONE (Numeric) ÷ [0.3] diff --git a/icu4c/source/test/testdata/break_rules/grapheme.txt b/icu4c/source/test/testdata/break_rules/grapheme.txt new file mode 100644 index 00000000000..4eba9a3c3f2 --- /dev/null +++ b/icu4c/source/test/testdata/break_rules/grapheme.txt @@ -0,0 +1,60 @@ +# +# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved. + +# file: grapheme.txt +# +# Reference Grapheme Break rules for intltest rbbi/RBBIMonkeyTest +# +# +# Note: Rule syntax and the monkey test itself are still a work in progress. +# They are expected to change with review and the addition of support for rule tailoring. + +type = grapheme; # one of grapheme | word | line | sentence +locale = en; + +CR = [\u000d]; +LF = [\u000a]; + +Control = [[\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]]; +Extend = [[\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]]; +Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}]; +Prepend = []; +SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}]; +E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; +E_Modifier = [\U0001F3FB-\U0001F3FF]; +GAZ = [\U0001F455-\U0001F469\U0001F48B\U0001F5E8\u2764]; +ZWJ = [\u200D]; + +# +# Korean Syllable Definitions +# +L = [\p{Grapheme_Cluster_Break = L}]; +V = [\p{Grapheme_Cluster_Break = V}]; +T = [\p{Grapheme_Cluster_Break = T}]; + +LV = [\p{Grapheme_Cluster_Break = LV}]; +LVT = 
[\p{Grapheme_Cluster_Break = LVT}]; + +GB3: CR LF; +GB4: (Control | CR | LF) ÷; +GB5: . ÷ (Control | CR | LF); + +GB6: L (L | V | LV | LVT); +GB7: (LV | V) (V | T); +GB8: (LVT | T) T; + +# Regional Indicators, split into pairs. +# Note that a pair of RIs that is not followed by a third RI will fall into +# the normal rules for Extend, etc. +# +GB8a.1: Regional_Indicator Regional_Indicator ÷ Regional_Indicator; +GB8a.2: Regional_Indicator Regional_Indicator; + +GB9: . Extend; + +GB9a: . SpacingMark; +GB9b: Prepend .; +GB9c: (E_Base | GAZ) E_Modifier; +GB9d: ZWJ GAZ; + +GB10: . ÷; diff --git a/icu4c/source/test/testdata/break_rules/line.txt b/icu4c/source/test/testdata/break_rules/line.txt new file mode 100644 index 00000000000..8823affb07c --- /dev/null +++ b/icu4c/source/test/testdata/break_rules/line.txt @@ -0,0 +1,196 @@ +# +# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved. + +# file: line.txt +# +# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest +# +# Note: Rule syntax and the monkey test itself are still a work in progress. +# They are expected to change with review and the addition of support for rule tailoring. 
+ + +type = line; +locale = en; + + +AI = [:LineBreak = Ambiguous:]; +AL = [[:LineBreak = Alphabetic:]-[\u2764]]; +BA = [:LineBreak = Break_After:]; +BB = [:LineBreak = Break_Before:]; +BK = [:LineBreak = Mandatory_Break:]; +B2 = [:LineBreak = Break_Both:]; +CB = [:LineBreak = Contingent_Break:]; +CJ = [:LineBreak = Conditional_Japanese_Starter:]; +CL = [:LineBreak = Close_Punctuation:]; +CM = [:LineBreak = Combining_Mark:]; +CP = [:LineBreak = Close_Parenthesis:]; +CR = [:LineBreak = Carriage_Return:]; + +EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; +EM = [\U0001F3FB-\U0001F3FF]; + +EX = [:LineBreak = Exclamation:]; +GL = [:LineBreak = Glue:]; +HL = [:LineBreak = Hebrew_Letter:]; +HY = [:LineBreak = Hyphen:]; +H2 = [:LineBreak = H2:]; +H3 = [:LineBreak = H3:]; +ID = [[:LineBreak = Ideographic:][\u2764]]; +IN = [:LineBreak = Inseperable:]; +IS = [:LineBreak = Infix_Numeric:]; +JL = [:LineBreak = JL:]; +JV = [:LineBreak = JV:]; +JT = [:LineBreak = JT:]; +LF = [:LineBreak = Line_Feed:]; +NL = [:LineBreak = Next_Line:]; +NS = [[:LineBreak = Nonstarter:] CJ]; +NU = [:LineBreak = Numeric:]; +OP = [:LineBreak = Open_Punctuation:]; +PO = [:LineBreak = Postfix_Numeric:]; +PR = [:LineBreak = Prefix_Numeric:]; +QU = [:LineBreak = Quotation:]; +RI = [:LineBreak = Regional_Indicator:]; +SA = [:LineBreak = Complex_Context:]; +SG = [:LineBreak = Surrogate:]; +SP = [:LineBreak = Space:]; +SY = [:LineBreak = Break_Symbols:]; +WJ = [:LineBreak = Word_Joiner:]; +XX = [:LineBreak = Unknown:]; +ZW = [:LineBreak = ZWSpace:]; +ZJ = [\u200D]; + +# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji. 
+ID = [ID - EB]; +AL = [AL - EM]; + +dictionary = [:LineBreak = Complex_Context:]; + +# Redefine AL. LB1. TODO: refine according to latest UAX. +AL = [ AL AI SA SG XX ]; + +LB4: BK ÷; +LB5: CR LF; +LB5.1: CR ÷; +LB5.2: LF ÷; +LB5.3: NL ÷; + +LB6: . (BK | CR | LF | NL); +LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL); + +# Rules LB14 - LB17. +# Moved before LB7, because they can match a longer sequence that would also match LB7, +# for example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it, +# "OP CM SP", matches LB7.1. +LB14: OP CM* SP* .; +LB15: QU CM* SP* OP; +LB16: (CL | CP)CM* SP* NS; +LB17: B2 CM* SP* B2; + +LB7.1: [^ZW SP] CM* [SP ZW]; +LB7.2: [ZW SP] [SP ZW]; + +# LB8, ICU differs from UAX-14, +# ICU: ZW ÷; +# UAX 14: ZW SP* ÷; +LB8: ZW ÷; + +# LB8a, from Emoji proposal L2/16-011R3 +# ZWJ x ID +LB8a: ZJ (ID | EB | EM); + + +# LB9: X CM -> X +# LB10: Unattached CM -> AL + +#LB11: × WJ; +# WJ × + +LB11.1: [^BK CR LF NL SP ZW] CM* WJ; +LB11.2: SP WJ; +LB11.3: WJ CM* [^CM]; + +LB12: GL CM* [^CM]; + +LB12a: [^SP BA HY] CM* GL; + +# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14. +# +# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule. +# LB13.2 SP CM* [CL CP EX IS SY] + +LB13.1: [^NU SP] CM* [CL CP IS SY]; +LB13.2: [^SP] CM* EX; +LB13.2: SP [CL CP EX IS SY]; + + +# LB 14-17 are moved above LB 7. + +LB18: SP ÷; + +LB19: . CM* QU; +LB19.1: QU CM* [^CM]; + +# LB 20 Break before and after CB. +# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ. +# ZJ acts like a CM to the left, combining with CB. +# ZJ acts independently to the right, no break from ID by LB8a. +LB20: . CM* ÷ CB; +LB20.1a: CB CM* ZJ (ID | EB | EM); +LB20.1b: CB CM* ÷; + +# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then +# not picking up the continuing match after the BA from 21a. +LB21a: HL CM* (HY | BA) CM* [^CM CB]; + +LB21.1: .
CM* [BA HY NS]; +LB21.2: BB CM* [^CM CB]; + +LB21b: SY CM* HL; + +LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL. +LB22.2: EX CM* IN; +LB22.3: (ID | EB | EM) CM* IN; +LB22.4: IN CM* IN; +LB22.5: NU CM* IN; + +LB23.1: (ID | EB | EM) CM* PO; +LB23.2: (AL | HL | CM) CM* NU; +LB23.3: NU CM* (AL | HL); + +LB24.1: PR CM* (ID | EB | EM); +LB24.2: PR CM* (AL | HL); +LB24.3: PO CM* (AL | HL); + +# Numbers. Equivalent to Tailoring example 8 from UAX 14. +LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?; + +LB26.1: JL CM* (JL | JV | H2 | H3); +LB26.2: (JV | H2) CM* (JV | JT); +LB26.3: (JT | H3) CM* JT; + +LB27.1: (JL | JV | JT | H2 | H3) CM* IN; +LB27.2: (JL | JV | JT | H2 | H3) CM* PO; +LB27.3: PR CM* (JL | JV | JT | H2 | H3); + +# LB28 Do not break between Alphabetics. +# Unattached (leading) CM treated as AL. +LB28: (AL | HL | CM)CM* (AL | HL); + +LB29: IS CM* (AL | HL); + +# LB30 is adjusted for unattached leading CM being treated as AL. +LB30.1: (AL | CM | HL | NU) CM* OP; +LB30.2: CP CM* (AL | HL | NU); + +# LB30a keep pairs of RI together. +LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS]; +LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM); +LB30a.3: RI CM* RI CM* ÷; + +# LB30b Do not break between Emoji Base and Emoji Modifier +LB30b: EB CM* EM; + +# LB31 Break Everywhere Else. +# Include combining marks +LB31.1: . CM* ZJ (ID | EB | EM); +LB31.2: . CM* ÷; diff --git a/icu4c/source/test/testdata/break_rules/line_loose.txt b/icu4c/source/test/testdata/break_rules/line_loose.txt new file mode 100644 index 00000000000..dcf6459bf31 --- /dev/null +++ b/icu4c/source/test/testdata/break_rules/line_loose.txt @@ -0,0 +1,204 @@ +# +# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
+# +# file: line_loose.txt +# +# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest +# +# Note: Rule syntax and the monkey test itself are still a work in progress. +# They are expected to change with review and the addition of support for rule tailoring. +# +# This tailors the line break behavior to correspond to CSS +# line-break=loose (BCP47 -u-lb-loose) as defined for languages other than +# Chinese & Japanese. +# It sets characters of class CJ to behave like ID. +# In addition, it allows breaks: +# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS) +# * between characters of LineBreak class IN + +type = line; +locale = en@lb=loose; + + +AI = [:LineBreak = Ambiguous:]; +AL = [[:LineBreak = Alphabetic:]-[\u2764]]; +BA = [:LineBreak = Break_After:]; +BB = [:LineBreak = Break_Before:]; +BK = [:LineBreak = Mandatory_Break:]; +B2 = [:LineBreak = Break_Both:]; +CB = [:LineBreak = Contingent_Break:]; +CJ = [:LineBreak = Conditional_Japanese_Starter:]; +CL = [:LineBreak = Close_Punctuation:]; +CM = [:LineBreak = Combining_Mark:]; +CP = [:LineBreak = Close_Parenthesis:]; +CR = [:LineBreak = Carriage_Return:]; + +EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; +EM = [\U0001F3FB-\U0001F3FF]; + +EX = [:LineBreak = Exclamation:]; +GL = [:LineBreak = Glue:]; +HL = [:LineBreak = Hebrew_Letter:]; +HY = [:LineBreak = Hyphen:]; +H2 = [:LineBreak = H2:]; +H3 = [:LineBreak = H3:]; +ID = [[:LineBreak = Ideographic:] CJ [\u2764]]; +IN = [:LineBreak = Inseperable:]; +IS = [:LineBreak = Infix_Numeric:]; +JL = [:LineBreak = JL:]; +JV = [:LineBreak = JV:]; +JT = [:LineBreak = JT:]; +LF = [:LineBreak = Line_Feed:]; +NL = [:LineBreak = 
Next_Line:]; +NSX = [\u3005 \u303B \u309D \u309E \u30FD \u30FE]; +NS = [[:LineBreak = Nonstarter:] - NSX]; +NU = [:LineBreak = Numeric:]; +OP = [:LineBreak = Open_Punctuation:]; +PO = [:LineBreak = Postfix_Numeric:]; +PR = [:LineBreak = Prefix_Numeric:]; +QU = [:LineBreak = Quotation:]; +RI = [:LineBreak = Regional_Indicator:]; +SA = [:LineBreak = Complex_Context:]; +SG = [:LineBreak = Surrogate:]; +SP = [:LineBreak = Space:]; +SY = [:LineBreak = Break_Symbols:]; +WJ = [:LineBreak = Word_Joiner:]; +XX = [:LineBreak = Unknown:]; +ZW = [:LineBreak = ZWSpace:]; +ZJ = [\u200D]; + +# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji. +ID = [ID - EB]; +AL = [AL - EM]; + +dictionary = [:LineBreak = Complex_Context:]; + +# Redfine AL. LB1. TODO: refine according to latest UAX. +AL = [ AL AI SA SG XX ]; + +LB4: BK ÷; +LB5: CR LF; +LB5.1: CR ÷; +LB5.2: LF ÷; +LB5.3: NL ÷; + +LB6: . (BK | CR | LF | NL); +LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL); + +# Rules LB14 - LB17. +# Moved before LB7, because they can match a longer sequence that would also match LB7, +# for example, the sequence "OP CM SP AL" matches LB14 while the prefix of it, +# "while only the prefix "OP CM SP" matches LB7.1 +LB14: OP CM* SP* .; +LB15: QU CM* SP* OP; +LB16: (CL | CP)CM* SP* NS; +LB17: B2 CM* SP* B2; + +LB7.1: [^ZW SP] CM* [SP ZW]; +LB7.2: [ZW SP] [SP ZW]; + +# LB8, ICU differs from UAX-14, +# ICU: ZW ÷; +# UAX 14: ZW SP* ÷; +LB8: ZW ÷; + +# LB8a, from Emoji proposal L2/16-011R3 +# ZWJ x ID +LB8a: ZJ (ID | EB | EM); + + +# LB9: X CM -> X +# LB10: Unattached CM -> AL + +#LB11: × WJ; +# WJ × + +LB11.1: [^BK CR LF NL SP ZW] CM* WJ; +LB11.2: SP WJ; +LB11.3: WJ CM* [^CM]; + +LB12: GL CM* [^CM]; + +LB12a: [^SP BA HY] CM* GL; + +# LB 13 ICU Tailoring, matches tailoring exmaple 8 from UAX 14. +# +# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule. 
+# LB13.2 SP CM* [CL CP EX IS SY] + +LB13.1: [^NU SP] CM* [CL CP IS SY]; +LB13.2: [^SP] CM* EX; +LB13.2: SP [CL CP EX IS SY]; + + +# LB 14-17 are moved above LB 7. + +LB18: SP ÷; + +LB19: . CM* QU; +LB19.1: QU CM* [^CM]; + +# LB 20 Break before and after CB. +# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ. +# ZJ acts like a CM to the left, combining with CB. +# ZJ acts independently to the right, no break from ID by LB8a. +LB20: . CM* ÷ CB; +LB20.1a: CB CM* ZJ (ID | EB | EM); +LB20.1b: CB CM* ÷; + +# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then +# not picking up the continuing match after the BA from 21a. +LB21a: HL CM* (HY | BA) CM* [^CM CB]; + +LB21.1: . CM* [BA HY NS]; +LB21.2: BB CM* [^CM CB]; + +LB21b: SY CM* HL; + +LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL. +LB22.2: EX CM* IN; +LB22.3: (ID | EB | EM) CM* IN; +# LB22.4: IN CM* IN; # delete this rule for CSS loose. +LB22.5: NU CM* IN; + +LB23.1: (ID | EB | EM) CM* PO; +LB23.2: (AL | HL | CM) CM* NU; +LB23.3: NU CM* (AL | HL); + +LB24.1: PR CM* (ID | EB | EM); +LB24.2: PR CM* (AL | HL); +LB24.3: PO CM* (AL | HL); + +# Numbers. Equivalent to Tailoring example 8 from UAx 14. +LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?; + +LB26.1: JL CM* (JL | JV | H2 | H3); +LB26.2: (JV | H2) CM* (JV | JT); +LB26.3: (JT | H3) CM* JT; + +LB27.1: (JL | JV | JT | H2 | H3) CM* IN; +LB27.2: (JL | JV | JT | H2 | H3) CM* PO; +LB27.3: PR CM* (JL | JV | JT | H2 | H3); + +# LB28 Do not break between Alphabetics. +# Unattached (leading) CM treated as AL. +LB28: (AL | HL | CM)CM* (AL | HL); + +LB29: IS CM* (AL | HL); + +# LB30 is adjusted for unattached leading CM being treated as AL. +LB30.1: (AL | CM | HL | NU) CM* OP; +LB30.2: CP CM* (AL | HL | NU); + +# LB31 keep pairs of RI together. 
+LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS]; +LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM); +LB30a.3: RI CM* RI CM* ÷; + +# LB30b Do not break between Emoji Base and Emoji Modifier +LB30b: EB CM* EM; + +# LB31 Break Everywhere Else. +# Include combining marks +LB31.1: . CM* ZJ (ID | EB | EM); +LB31.2: . CM* ÷; diff --git a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt new file mode 100644 index 00000000000..bd6a6847b63 --- /dev/null +++ b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt @@ -0,0 +1,225 @@ +# +# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved. +# +# file: line_loose_cj.txt +# +# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest +# +# Note: Rule syntax and the monkey test itself are still a work in progress. +# They are expected to change with review and the addition of support for rule tailoring. +# +# Line Breaking Rules +# Implement default line breaking as defined by +# Unicode Standard Annex #14 Revision 34 for Unicode 8.0 +# http://www.unicode.org/reports/tr14/ +# tailored as noted in 2nd paragraph below.. +# +# This tailors the line break behavior to correspond to CSS +# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese. +# It sets characters of class CJ to behave like ID. +# In addition, it allows breaks: +# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS) +# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS) +# * between characters of LineBreak class IN such as 2026 +# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B, +# FF65 (all NS) and FF01, FF1F (both EX). 
+# * before suffix characters with LineBreak class PO and EastAsianWidth A,F,W; +# this includes: 00B0 2030 2032 2033 2035 2103 2109 FE6A FF05 FFE0 +# * after prefix characters with LineBreak class PR and EastAsianWidth A,F,W; +# this includes: 00A4 00B1 20AC 2116 FE69 FF04 FFE1 FFE5 FFE6 + + +type = line; +locale = ja@lb=loose; + + +AI = [:LineBreak = Ambiguous:]; +AL = [[:LineBreak = Alphabetic:]-[\u2764]]; +BAX = [\u2010 \u2013]; +BA = [[:LineBreak = Break_After:] - BAX]; +BB = [:LineBreak = Break_Before:]; +BK = [:LineBreak = Mandatory_Break:]; +B2 = [:LineBreak = Break_Both:]; +CB = [:LineBreak = Contingent_Break:]; +CJ = [:LineBreak = Conditional_Japanese_Starter:]; +CL = [:LineBreak = Close_Punctuation:]; +CM = [:LineBreak = Combining_Mark:]; +CP = [:LineBreak = Close_Parenthesis:]; +CR = [:LineBreak = Carriage_Return:]; + +EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; +EM = [\U0001F3FB-\U0001F3FF]; + +EXX = [\uFF01 \uFF1F]; +EX = [[:LineBreak = Exclamation:] - EXX]; +GL = [:LineBreak = Glue:]; +HL = [:LineBreak = Hebrew_Letter:]; +HY = [:LineBreak = Hyphen:]; +H2 = [:LineBreak = H2:]; +H3 = [:LineBreak = H3:]; +ID = [[:LineBreak = Ideographic:][\u2764]CJ]; +IN = [:LineBreak = Inseperable:]; +IS = [:LineBreak = Infix_Numeric:]; +JL = [:LineBreak = JL:]; +JV = [:LineBreak = JV:]; +JT = [:LineBreak = JT:]; +LF = [:LineBreak = Line_Feed:]; +NL = [:LineBreak = Next_Line:]; +NSX = [\u301C \u30A0 \u3005 \u303B \u309D \u309E \u30FD \u30FE \u203C \u2047 \u2048 \u2049 \u30FB \uFF1A \uFF1B \uFF65]; +NS = [[:LineBreak = Nonstarter:] - NSX]; +NU = [:LineBreak = Numeric:]; +OP = [:LineBreak = Open_Punctuation:]; +POX = [\u00B0 \u2030 \u2032 \u2033 
\u2035 \u2103 \u2109 \uFE6A \uFF05 \uFFE0]; +PO = [[:LineBreak = Postfix_Numeric:] - POX]; +PRX = [\u00A4 \u00B1 \u20AC \u2116 \uFE69 \uFF04 \uFFE1 \uFFE5 \uFFE6]; +PR = [[:LineBreak = Prefix_Numeric:] - PRX]; +QU = [:LineBreak = Quotation:]; +RI = [:LineBreak = Regional_Indicator:]; +SA = [:LineBreak = Complex_Context:]; +SG = [:LineBreak = Surrogate:]; +SP = [:LineBreak = Space:]; +SY = [:LineBreak = Break_Symbols:]; +WJ = [:LineBreak = Word_Joiner:]; +XX = [:LineBreak = Unknown:]; +ZW = [:LineBreak = ZWSpace:]; +ZJ = [\u200D]; + +# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji. +ID = [ID - EB]; +AL = [AL - EM]; + +dictionary = [:LineBreak = Complex_Context:]; + +# Redfine AL. LB1. TODO: refine according to latest UAX. +AL = [ AL AI SA SG XX ]; + +LB4: BK ÷; +LB5: CR LF; +LB5.1: CR ÷; +LB5.2: LF ÷; +LB5.3: NL ÷; + +LB6: . (BK | CR | LF | NL); +LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL); + +# Rules LB14 - LB17. +# Moved before LB7, because they can match a longer sequence that would also match LB7, +# for example, the sequence "OP CM SP AL" matches LB14 while the prefix of it, +# "while only the prefix "OP CM SP" matches LB7.1 +LB14: OP CM* SP* .; +LB15: QU CM* SP* OP; +LB16: (CL | CP)CM* SP* NS; +LB17: B2 CM* SP* B2; + +LB7.1: [^ZW SP] CM* [SP ZW]; +LB7.2: [ZW SP] [SP ZW]; + +# LB8, ICU differs from UAX-14, +# ICU: ZW ÷; +# UAX 14: ZW SP* ÷; +LB8: ZW ÷; + +# LB8a, from Emoji proposal L2/16-011R3 +# ZWJ x ID +LB8a: ZJ (ID | EB | EM); + + +# LB9: X CM -> X +# LB10: Unattached CM -> AL + +#LB11: × WJ; +# WJ × + +LB11.1: [^BK CR LF NL SP ZW] CM* WJ; +LB11.2: SP WJ; +LB11.3: WJ CM* [^CM]; + +LB12: GL CM* [^CM]; + +LB12a: [^SP BA BAX HY] CM* GL; + +# LB 13 ICU Tailoring, matches tailoring exmaple 8 from UAX 14. +# +# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule. 
+# LB13.2 SP CM* [CL CP EX IS SY] + +LB13.1: [^NU SP] CM* [CL CP IS SY]; +LB13.2: [^SP] CM* EX; +LB13.2: SP [CL CP EX IS SY]; + + +# LB 14-17 are moved above LB 7. + +LB18: SP ÷; + +LB19: . CM* QU; +LB19.1: QU CM* [^CM]; + +# LB 20 Break before and after CB. +# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ. +# ZJ acts like a CM to the left, combining with CB. +# ZJ acts independently to the right, no break from ID by LB8a. +LB20: . CM* ÷ CB; +LB20.1a: CB CM* ZJ (ID | EB | EM); +LB20.1b: CB CM* ÷; + +# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then +# not picking up the continuing match after the BA from 21a. +# LB 21a Don't break after Hebrew + Hyphen +# HL (HY | BA) x + +LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?; + +LB21.1: . CM* [BA HY NS]; +LB21.2: BB CM* [^CM CB]; + +LB21b: SY CM* HL; + +LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL. +LB22.2: EX CM* IN; +LB22.3: (ID | EB | EM) CM* IN; +# LB22.4: IN CM* IN; # delete this rule for CSS loose. +LB22.5: NU CM* IN; + +LB23.1: (ID | EB | EM) CM* PO; +LB23.2: (AL | HL | CM) CM* NU; +LB23.3: NU CM* (AL | HL); + +LB24.1: PR CM* (ID | EB | EM); +LB24.2: PR CM* (AL | HL); +LB24.3: (PO | POX) CM* (AL | HL); + +# Numbers. Equivalent to Tailoring example 8 from UAx 14. +# Loose_cj tailoring: do not include $PRX at the beginning or $POX at the end. +LB25: ((PR | PO | POX)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PRX | PO))?; + +LB26.1: JL CM* (JL | JV | H2 | H3); +LB26.2: (JV | H2) CM* (JV | JT); +LB26.3: (JT | H3) CM* JT; + +LB27.1: (JL | JV | JT | H2 | H3) CM* IN; +LB27.2: (JL | JV | JT | H2 | H3) CM* PO; +LB27.3: PR CM* (JL | JV | JT | H2 | H3); + +# LB28 Do not break between Alphabetics. +# Unattached (leading) CM treated as AL. +LB28: (AL | HL | CM)CM* (AL | HL); + +LB29: IS CM* (AL | HL); + +# LB30 is adjusted for unattached leading CM being treated as AL. 
+LB30.1: (AL | CM | HL | NU) CM* OP; +LB30.2: CP CM* (AL | HL | NU); + +# LB31 keep pairs of RI together. +LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS]; +LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM); +LB30a.3: RI CM* RI CM* ÷; + +# LB30b Do not break between Emoji Base and Emoji Modifier +LB30b: EB CM* EM; + +# LB31 Break Everywhere Else. +# Include combining marks +LB31.1: . CM* ZJ (ID | EB | EM); +LB31.2: . CM* ÷; diff --git a/icu4c/source/test/testdata/break_rules/line_normal.txt b/icu4c/source/test/testdata/break_rules/line_normal.txt new file mode 100644 index 00000000000..74499a36be4 --- /dev/null +++ b/icu4c/source/test/testdata/break_rules/line_normal.txt @@ -0,0 +1,210 @@ +# +# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved. +# +# file: line_normal.txt +# +# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest +# +# Note: Rule syntax and the monkey test itself are still a work in progress. +# They are expected to change with review and the addition of support for rule tailoring. +# +# Line Breaking Rules +# Implement default line breaking as defined by +# Unicode Standard Annex #14 Revision 34 for Unicode 8.0 +# http://www.unicode.org/reports/tr14/ +# tailored as noted in 2nd paragraph below.. +# +# TODO: Rule LB 8 remains as it was in Unicode 5.2 +# This is only because of a limitation of ICU break engine implementation, +# not because the older behavior is desirable. +# +# This tailors the line break behavior to correspond to CSS +# line-break=normal (BCP47 -u-lb-normal) as defined for languages other than +# Chinese & Japanese. +# It sets characters of class CJ to behave like ID. 
+ + +type = line; +locale = en@lb=normal; + +AI = [:LineBreak = Ambiguous:]; +AL = [[:LineBreak = Alphabetic:]-[\u2764]]; +BA = [:LineBreak = Break_After:]; +BB = [:LineBreak = Break_Before:]; +BK = [:LineBreak = Mandatory_Break:]; +B2 = [:LineBreak = Break_Both:]; +CB = [:LineBreak = Contingent_Break:]; +CJ = [:LineBreak = Conditional_Japanese_Starter:]; +CL = [:LineBreak = Close_Punctuation:]; +CM = [:LineBreak = Combining_Mark:]; +CP = [:LineBreak = Close_Parenthesis:]; +CR = [:LineBreak = Carriage_Return:]; + +EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; +EM = [\U0001F3FB-\U0001F3FF]; + +EX = [:LineBreak = Exclamation:]; +GL = [:LineBreak = Glue:]; +HL = [:LineBreak = Hebrew_Letter:]; +HY = [:LineBreak = Hyphen:]; +H2 = [:LineBreak = H2:]; +H3 = [:LineBreak = H3:]; +ID = [[:LineBreak = Ideographic:] CJ [\u2764]]; +IN = [:LineBreak = Inseperable:]; +IS = [:LineBreak = Infix_Numeric:]; +JL = [:LineBreak = JL:]; +JV = [:LineBreak = JV:]; +JT = [:LineBreak = JT:]; +LF = [:LineBreak = Line_Feed:]; +NL = [:LineBreak = Next_Line:]; +NS = [:LineBreak = Nonstarter:]; +NU = [:LineBreak = Numeric:]; +OP = [:LineBreak = Open_Punctuation:]; +PO = [:LineBreak = Postfix_Numeric:]; +PR = [:LineBreak = Prefix_Numeric:]; +QU = [:LineBreak = Quotation:]; +RI = [:LineBreak = Regional_Indicator:]; +SA = [:LineBreak = Complex_Context:]; +SG = [:LineBreak = Surrogate:]; +SP = [:LineBreak = Space:]; +SY = [:LineBreak = Break_Symbols:]; +WJ = [:LineBreak = Word_Joiner:]; +XX = [:LineBreak = Unknown:]; +ZW = [:LineBreak = ZWSpace:]; +ZJ = [\u200D]; + +# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji. 
+ID = [ID - EB]; +AL = [AL - EM]; + +dictionary = [:LineBreak = Complex_Context:]; + +# Redfine AL. LB1. TODO: refine according to latest UAX. +AL = [ AL AI SA SG XX ]; + +LB4: BK ÷; +LB5: CR LF; +LB5.1: CR ÷; +LB5.2: LF ÷; +LB5.3: NL ÷; + +LB6: . (BK | CR | LF | NL); +LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL); + +# Rules LB14 - LB17. +# Moved before LB7, because they can match a longer sequence that would also match LB7, +# for example, the sequence "OP CM SP AL" matches LB14 while the prefix of it, +# "while only the prefix "OP CM SP" matches LB7.1 +LB14: OP CM* SP* .; +LB15: QU CM* SP* OP; +LB16: (CL | CP)CM* SP* NS; +LB17: B2 CM* SP* B2; + +LB7.1: [^ZW SP] CM* [SP ZW]; +LB7.2: [ZW SP] [SP ZW]; + +# LB8, ICU differs from UAX-14, +# ICU: ZW ÷; +# UAX 14: ZW SP* ÷; +LB8: ZW ÷; + +# LB8a, from Emoji proposal L2/16-011R3 +# ZWJ x ID +LB8a: ZJ (ID | EB | EM); + + +# LB9: X CM -> X +# LB10: Unattached CM -> AL + +#LB11: × WJ; +# WJ × + +LB11.1: [^BK CR LF NL SP ZW] CM* WJ; +LB11.2: SP WJ; +LB11.3: WJ CM* [^CM]; + +LB12: GL CM* [^CM]; + +LB12a: [^SP BA HY] CM* GL; + +# LB 13 ICU Tailoring, matches tailoring exmaple 8 from UAX 14. +# +# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule. +# LB13.2 SP CM* [CL CP EX IS SY] + +LB13.1: [^NU SP] CM* [CL CP IS SY]; +LB13.2: [^SP] CM* EX; +LB13.2: SP [CL CP EX IS SY]; + + +# LB 14-17 are moved above LB 7. + +LB18: SP ÷; + +LB19: . CM* QU; +LB19.1: QU CM* [^CM]; + +# LB 20 Break before and after CB. +# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ. +# ZJ acts like a CM to the left, combining with CB. +# ZJ acts independently to the right, no break from ID by LB8a. +LB20: . CM* ÷ CB; +LB20.1a: CB CM* ZJ (ID | EB | EM); +LB20.1b: CB CM* ÷; + +# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then +# not picking up the continuing match after the BA from 21a. +LB21a: HL CM* (HY | BA) CM* [^CM CB]; + +LB21.1: . 
CM* [BA HY NS]; +LB21.2: BB CM* [^CM CB]; + +LB21b: SY CM* HL; + +LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL. +LB22.2: EX CM* IN; +LB22.3: (ID | EB | EM) CM* IN; +LB22.4: IN CM* IN; +LB22.5: NU CM* IN; + +LB23.1: (ID | EB | EM) CM* PO; +LB23.2: (AL | HL | CM) CM* NU; +LB23.3: NU CM* (AL | HL); + +LB24.1: PR CM* (ID | EB | EM); +LB24.2: PR CM* (AL | HL); +LB24.3: PO CM* (AL | HL); + +# Numbers. Equivalent to Tailoring example 8 from UAx 14. +LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?; + +LB26.1: JL CM* (JL | JV | H2 | H3); +LB26.2: (JV | H2) CM* (JV | JT); +LB26.3: (JT | H3) CM* JT; + +LB27.1: (JL | JV | JT | H2 | H3) CM* IN; +LB27.2: (JL | JV | JT | H2 | H3) CM* PO; +LB27.3: PR CM* (JL | JV | JT | H2 | H3); + +# LB28 Do not break between Alphabetics. +# Unattached (leading) CM treated as AL. +LB28: (AL | HL | CM)CM* (AL | HL); + +LB29: IS CM* (AL | HL); + +# LB30 is adjusted for unattached leading CM being treated as AL. +LB30.1: (AL | CM | HL | NU) CM* OP; +LB30.2: CP CM* (AL | HL | NU); + +# LB31 keep pairs of RI together. +LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS]; +LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM); +LB30a.3: RI CM* RI CM* ÷; + +# LB30b Do not break between Emoji Base and Emoji Modifier +LB30b: EB CM* EM; + +# LB31 Break Everywhere Else. +# Include combining marks +LB31.1: . CM* ZJ (ID | EB | EM); +LB31.2: . CM* ÷; diff --git a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt new file mode 100644 index 00000000000..6bff8d9b338 --- /dev/null +++ b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt @@ -0,0 +1,218 @@ +# Copyright (c) 2016 International Business Machines Corporation and # others. All Rights Reserved. 
+# +# file: line_normal_cj.txt +# +# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest +# +# Note: Rule syntax and the monkey test itself are still a work in progress. +# They are expected to change with review and the addition of support for rule tailoring. +# +# Line Breaking Rules +# Implement default line breaking as defined by +# Unicode Standard Annex #14 Revision 34 for Unicode 8.0 +# http://www.unicode.org/reports/tr14/ +# tailored as noted in 2nd paragraph below.. +# +# TODO: Rule LB 8 remains as it was in Unicode 5.2 +# This is only because of a limitation of ICU break engine implementation, +# not because the older behavior is desirable. +# +# This tailors the line break behavior to correspond to CSS +# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese. +# It sets characters of class CJ to behave like ID. +# In addition, it allows breaks: +# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS) + +type = line; +locale = ja@lb=normal; + +AI = [:LineBreak = Ambiguous:]; +AL = [[:LineBreak = Alphabetic:]-[\u2764]]; +BAX = [\u2010 \u2013]; +BA = [[:LineBreak = Break_After:] - BAX]; +BB = [:LineBreak = Break_Before:]; +BK = [:LineBreak = Mandatory_Break:]; +B2 = [:LineBreak = Break_Both:]; +CB = [:LineBreak = Contingent_Break:]; +CJ = [:LineBreak = Conditional_Japanese_Starter:]; +CL = [:LineBreak = Close_Punctuation:]; +CM = [:LineBreak = Combining_Mark:]; +CP = [:LineBreak = Close_Parenthesis:]; +CR = [:LineBreak = Carriage_Return:]; + +EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; +EM = [\U0001F3FB-\U0001F3FF]; + +EX = [:LineBreak = Exclamation:]; +GL = [:LineBreak = Glue:]; +HL = 
[:LineBreak = Hebrew_Letter:]; +HY = [:LineBreak = Hyphen:]; +H2 = [:LineBreak = H2:]; +H3 = [:LineBreak = H3:]; +ID = [[:LineBreak = Ideographic:] CJ [\u2764]]; +IN = [:LineBreak = Inseperable:]; +IS = [:LineBreak = Infix_Numeric:]; +JL = [:LineBreak = JL:]; +JV = [:LineBreak = JV:]; +JT = [:LineBreak = JT:]; +LF = [:LineBreak = Line_Feed:]; +NL = [:LineBreak = Next_Line:]; +NSX = [\u301C \u30A0]; +NS = [[:LineBreak = Nonstarter:] - NSX]; +NU = [:LineBreak = Numeric:]; +OP = [:LineBreak = Open_Punctuation:]; +PO = [:LineBreak = Postfix_Numeric:]; +PR = [:LineBreak = Prefix_Numeric:]; +QU = [:LineBreak = Quotation:]; +RI = [:LineBreak = Regional_Indicator:]; +SA = [:LineBreak = Complex_Context:]; +SG = [:LineBreak = Surrogate:]; +SP = [:LineBreak = Space:]; +SY = [:LineBreak = Break_Symbols:]; +WJ = [:LineBreak = Word_Joiner:]; +XX = [:LineBreak = Unknown:]; +ZW = [:LineBreak = ZWSpace:]; +ZJ = [\u200D]; + +# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji. +ID = [ID - EB]; +AL = [AL - EM]; + +dictionary = [:LineBreak = Complex_Context:]; + +# Redfine AL. LB1. TODO: refine according to latest UAX. +AL = [ AL AI SA SG XX ]; + +LB4: BK ÷; +LB5: CR LF; +LB5.1: CR ÷; +LB5.2: LF ÷; +LB5.3: NL ÷; + +LB6: . (BK | CR | LF | NL); +LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL); + +# Rules LB14 - LB17. 
+# Moved before LB7, because they can match a longer sequence that would also match LB7, +# for example, the sequence "OP CM SP AL" matches LB14, +# while only the prefix "OP CM SP" matches LB7.1. +LB14: OP CM* SP* .; +LB15: QU CM* SP* OP; + +# Do not break between closing punctuation and $NS, even with intervening spaces +# But DO allow a break between closing punctuation and $NSX, don't include it here +LB16: (CL | CP)CM* SP* NS; +LB17: B2 CM* SP* B2; + +LB7.1: [^ZW SP] CM* [SP ZW]; +LB7.2: [ZW SP] [SP ZW]; + +# LB8, ICU differs from UAX-14, +# ICU: ZW ÷; +# UAX 14: ZW SP* ÷; +LB8: ZW ÷; + +# LB8a, from Emoji proposal L2/16-011R3 +# ZWJ x ID +LB8a: ZJ (ID | EB | EM); + + +# LB9: X CM -> X +# LB10: Unattached CM -> AL + +#LB11: × WJ; +# WJ × + +LB11.1: [^BK CR LF NL SP ZW] CM* WJ; +LB11.2: SP WJ; +LB11.3: WJ CM* [^CM]; + +LB12: GL CM* [^CM]; + +LB12a: [^SP BA BAX HY] CM* GL; + +# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14. +# +# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule. +# LB13.2 SP CM* [CL CP EX IS SY] + +LB13.1: [^NU SP] CM* [CL CP IS SY]; +LB13.2: [^SP] CM* EX; +LB13.2: SP [CL CP EX IS SY]; + + +# LB 14-17 are moved above LB 7. + +LB18: SP ÷; + +LB19: . CM* QU; +LB19.1: QU CM* [^CM]; + +# LB 20 Break before and after CB. +# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ. +# ZJ acts like a CM to the left, combining with CB. +# ZJ acts independently to the right, no break from ID by LB8a. +LB20: . CM* ÷ CB; +LB20.1a: CB CM* ZJ (ID | EB | EM); +LB20.1b: CB CM* ÷; + +# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then +# not picking up the continuing match after the BA from 21a. +# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so, +# should "HL BAX" not break when followed by a CB? That's what the current +# rules do, which is why "[^CM CB]?" includes the ?. 
+LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?; + +# DO allow breaks here before $BAXcm and $NSXcm, so don't include them +LB21.1: . CM* [BA HY NS]; +LB21.2: BB CM* [^CM CB]; + +LB21b: SY CM* HL; + +LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL. +LB22.2: EX CM* IN; +LB22.3: (ID | EB | EM) CM* IN; +LB22.4: IN CM* IN; +LB22.5: NU CM* IN; + +LB23.1: (ID | EB | EM) CM* PO; +LB23.2: (AL | HL | CM) CM* NU; +LB23.3: NU CM* (AL | HL); + +LB24.1: PR CM* (ID | EB | EM); +LB24.2: PR CM* (AL | HL); +LB24.3: PO CM* (AL | HL); + +# Numbers. Equivalent to Tailoring example 8 from UAx 14. +LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?; + +LB26.1: JL CM* (JL | JV | H2 | H3); +LB26.2: (JV | H2) CM* (JV | JT); +LB26.3: (JT | H3) CM* JT; + +LB27.1: (JL | JV | JT | H2 | H3) CM* IN; +LB27.2: (JL | JV | JT | H2 | H3) CM* PO; +LB27.3: PR CM* (JL | JV | JT | H2 | H3); + +# LB28 Do not break between Alphabetics. +# Unattached (leading) CM treated as AL. +LB28: (AL | HL | CM)CM* (AL | HL); + +LB29: IS CM* (AL | HL); + +# LB30 is adjusted for unattached leading CM being treated as AL. +LB30.1: (AL | CM | HL | NU) CM* OP; +LB30.2: CP CM* (AL | HL | NU); + +# LB31 keep pairs of RI together. +LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS]; +LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM); +LB30a.3: RI CM* RI CM* ÷; + +# LB30b Do not break between Emoji Base and Emoji Modifier +LB30b: EB CM* EM; + +# LB31 Break Everywhere Else. +# Include combining marks +LB31.1: . CM* ZJ (ID | EB | EM); +LB31.2: . CM* ÷; diff --git a/icu4c/source/test/testdata/break_rules/readme.txt b/icu4c/source/test/testdata/break_rules/readme.txt new file mode 100644 index 00000000000..0c63975b532 --- /dev/null +++ b/icu4c/source/test/testdata/break_rules/readme.txt @@ -0,0 +1,69 @@ +file: testdata/break_rules/readme.txt +Copyright (c) 2015, International Business Machines Corporation and others. All Rights Reserved. 
+ +This directory contains the break iterator reference rule files used by intltest rbbi/RBBIMonkeyTest/testMonkey. +The rules in this directory track the boundary rules from Unicode UAX 14 and 29. They are interpreted +to provide an expected set of boundary positions to compare with the results from ICU break iteration. + +Each set of reference break rules lives in a separate file. +The list of rule files to run by default is hardcoded into the test code, in rbbimonkeytest.cpp. + +Each test file includes + - The type of ICU break iterator to create (word, line, sentence, etc.) + - The locale to use + - Character Class definitions + - Rule definitions + +To Do + - Syntax for tailoring. + + +Character Class Definition: + name = set_regular_expression; + +Rule Definition: + rule_regular_expression; + +name: + [A-Za-z_][A-Za-z0-9_]* + +set_regular_expression: + The intersection of an ICU regular expression [set] expression and a UnicodeSet pattern. + (They are mostly the same) + May include previously defined set names, which are logically expanded in-place. + +rule_regular_expression: + An ICU Regular Expression. + May include set names, which are logically expanded in-place. + May include a '÷', which defines a boundary position. + +Application of the rules: + Matching begins at the start of text, or after a previously identified boundary. + The pseudo-code below finds the next boundary. + + while position < end of text + for each rule + if the text at position matches this rule + if the rule has a '÷' + Boundary is found. + return the position of the '÷' within the match. + else + position = last character of the rule match. + break from the rule loop, continue the outer loop. + + This differs from the Unicode UAX algorithm in that each position in the text is + not tested separately. Instead, when a rule match is found, rule application restarts with the last + character of the preceding rule match. ICU's break rules also operate this way. 
+ + Expressing rules this way simplifies UAX rules that have leading or trailing context; it + is no longer necessary to write expressions that match the context starting from + any position within it. + + This rule form differs from ICU rules in that the rules are applied sequentially, as they + are with the Unicode UAX rules. With the main ICU break rules, all are applied in parallel. + +Word Dictionaries + The monkey test does not test dictionary based breaking. The set named 'dictionary' is special, + as it is in the main ICU rules. For the monkey test, no characters from the dictionary set are + included in the randomly-generated test data. + diff --git a/icu4c/source/test/testdata/break_rules/sentence.txt b/icu4c/source/test/testdata/break_rules/sentence.txt new file mode 100644 index 00000000000..0a58a9c962a --- /dev/null +++ b/icu4c/source/test/testdata/break_rules/sentence.txt @@ -0,0 +1,43 @@ +type = sentence; # one of grapheme | word | line | sentence +locale = en; + +CR = [\p{Sentence_Break = CR}]; +LF = [\p{Sentence_Break = LF}]; +Extend = [\p{Sentence_Break = Extend}]; +Sep = [\p{Sentence_Break = Sep}]; +Format = [\p{Sentence_Break = Format}]; +Sp = [\p{Sentence_Break = Sp}]; +Lower = [\p{Sentence_Break = Lower}]; +Upper = [\p{Sentence_Break = Upper}]; +OLetter = [\p{Sentence_Break = OLetter}]; +Numeric = [\p{Sentence_Break = Numeric}]; +ATerm = [\p{Sentence_Break = ATerm}]; +SContinue = [\p{Sentence_Break = SContinue}]; +STerm = [\p{Sentence_Break = STerm}]; +Close = [\p{Sentence_Break = Close}]; + +ParaSep = [Sep CR LF]; +SATerm = [STerm ATerm]; +ExtFmt = [Extend Format]; + +# SB2: ÷ eot +# Conventional regular expression matching for '$' as end-of-text also matches +# at a line separator just preceding the physical end of text. +# Instead, use a look-ahead assertion that there is no following character. +SB2: . ÷ (?!.); + +SB3: CR LF; +SB4: ParaSep ÷; + +# SB5: ignore Format and Extend characters. 
+ +SB6: ATerm ExtFmt* Numeric; +SB7: (Upper | Lower) ExtFmt* ATerm ExtFmt* Upper; +SB8: ATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* ([^OLetter Upper Lower ParaSep SATerm ExtFmt] ExtFmt *)* Lower; +SB8a: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (SContinue | SATerm); + +SB9: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (CR LF | ParaSep)? ÷; + # Also covers SB10, SB11. + +SB12: . ExtFmt* [^ExtFmt]?; + diff --git a/icu4c/source/test/testdata/break_rules/word.txt b/icu4c/source/test/testdata/break_rules/word.txt new file mode 100644 index 00000000000..97d5b192bb9 --- /dev/null +++ b/icu4c/source/test/testdata/break_rules/word.txt @@ -0,0 +1,97 @@ +# +# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved. + +# file: word.txt +# +# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest +# +# Note: Rule syntax and the monkey test itself are still a work in progress. +# They are expected to change with review and the addition of support for rule tailoring. 
+ + +type = word; # one of grapheme | word | line | sentence +locale = en; + +E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; +E_Modifier = [\U0001F3FB-\U0001F3FF]; +ZWJ = [\u200D]; +GAZ = [\U0001F466-\U0001F469\U0001F48B\U0001F5E8\u2764]; + +CR = [\p{Word_Break = CR}]; +LF = [\p{Word_Break = LF}]; +Newline = [\p{Word_Break = Newline}]; +Extend = [[[\p{Word_Break = Extend}][:Block=Tags:]]-ZWJ]; +Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; +Format = [[\p{Word_Break = Format}]-[:Block=Tags:]]; +Katakana = [\p{Word_Break = Katakana}]; +Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; +ALetter = [\p{Word_Break = ALetter}]; +Single_Quote = [\p{Word_Break = Single_Quote}]; +Double_Quote = [\p{Word_Break = Double_Quote}]; +MidNumLet = [\p{Word_Break = MidNumLet}]; +MidLetter = [\p{Word_Break = MidLetter}]; +MidNum = [\p{Word_Break = MidNum}]; +Numeric = [\p{Word_Break = Numeric}]; +ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; + +#define dicitionary, with the effect being that those characters don't appear in test data. + +Han = [:Han:]; +Hiragana = [:Hiragana:]; + +Control = [\p{Grapheme_Cluster_Break = Control}]; +HangulSyllable = [\uac00-\ud7a3]; +ComplexContext = [:LineBreak = Complex_Context:]; +KanaKanji = [Han Hiragana Katakana]; +dictionaryCJK = [KanaKanji HangulSyllable]; +dictionary = [ComplexContext dictionaryCJK]; + +# leave CJK scripts out of ALetterPlus +# Tricky. Redfine a set. +# For tailorings, if it modifies itself, do at end of sets ???? +# Tweak redefine to mean replace existing definition at its original location. +# Insert defs without redefine just after last pre-existing def of that name. 
+# Maybe drop redefine, add warning for sets defined and not used, should catch typos. + +ALetter = [ALetter - dictionary]; + +AHLetter = [ALetter Hebrew_Letter]; +MidNumLetQ = [MidNumLet Single_Quote]; +ExtFmt = [Extend Format ZWJ]; + +WB3: CR LF; +WB3a: (Newline | CR | LF) ÷; +WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines. + # (but needed with UAX treat-as scheme.) +WB3c: ZWJ GAZ; + +WB5: AHLetter ExtFmt* AHLetter; + +# includes both WB6 and WB7 +WB6: AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt* AHLetter; + +WB7a: Hebrew_Letter ExtFmt* Single_Quote; +WB7b: Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter; # Include WB7c + +WB8: Numeric ExtFmt* Numeric; +WB9: AHLetter ExtFmt* Numeric; +WB10: Numeric ExtFmt* AHLetter; + +WB11: Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric; # includes WB12 +WB13: Katakana ExtFmt* Katakana; + +WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet; +WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana); + +# WB rule 13c, pairs of Regional Indicators stay unbroken. +# Interacts with WB3c. +WB13c.1: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ GAZ; +WB13c.2: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷; + +WB13d: (E_Base | GAZ) ExtFmt* E_Modifier; + +# Rule WB 14 Any ÷ Any +# Interacts with WB3c, do not break between ZWJ and GAZ. +WB14.1: . ExtFmt* ZWJ GAZ; +WB14.2: . ExtFmt* ÷; + diff --git a/icu4c/source/test/testdata/break_rules/word_POSIX.txt b/icu4c/source/test/testdata/break_rules/word_POSIX.txt new file mode 100644 index 00000000000..e2d9dd7d776 --- /dev/null +++ b/icu4c/source/test/testdata/break_rules/word_POSIX.txt @@ -0,0 +1,96 @@ +# +# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved. + +# file: word_POSIX.txt +# +# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest +# +# Note: Rule syntax and the monkey test itself are still a work in progress. 
+# They are expected to change with review and the addition of support for rule tailoring. + +type = word; # one of grapheme | word | line | sentence +locale = en_US_POSIX; + +E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; +E_Modifier = [\U0001F3FB-\U0001F3FF]; +ZWJ = [\u200D]; +GAZ = [\U0001F466-\U0001F469\U0001F48B\U0001F5E8\u2764]; + +CR = [\p{Word_Break = CR}]; +LF = [\p{Word_Break = LF}]; +Newline = [\p{Word_Break = Newline}]; +Extend = [[[\p{Word_Break = Extend}][:Block=Tags:]]-ZWJ]; +Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; +Format = [[\p{Word_Break = Format}]-[:Block=Tags:]]; +Katakana = [\p{Word_Break = Katakana}]; +Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; +ALetter = [\p{Word_Break = ALetter}]; +Single_Quote = [\p{Word_Break = Single_Quote}]; +Double_Quote = [\p{Word_Break = Double_Quote}]; +MidNumLet = [\p{Word_Break = MidNumLet} - [.]]; +MidLetter = [\p{Word_Break = MidLetter} - [\:]]; +MidNum = [\p{Word_Break = MidNum} [.]]; +Numeric = [\p{Word_Break = Numeric}]; +ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; + +#define dictionary, with the effect being that those characters don't appear in test data. + +Han = [:Han:]; +Hiragana = [:Hiragana:]; + +Control = [\p{Grapheme_Cluster_Break = Control}]; +HangulSyllable = [\uac00-\ud7a3]; +ComplexContext = [:LineBreak = Complex_Context:]; +KanaKanji = [Han Hiragana Katakana]; +dictionaryCJK = [KanaKanji HangulSyllable]; +dictionary = [ComplexContext dictionaryCJK]; + +# leave CJK scripts out of ALetterPlus +# Tricky. Redefine a set. +# For tailorings, if it modifies itself, do at end of sets ???? 
+# Tweak redefine to mean replace existing definition at its original location. +# Insert defs without redefine just after last pre-existing def of that name. +# Maybe drop redefine, add warning for sets defined and not used, should catch typos. + +ALetter = [ALetter - dictionary]; + +AHLetter = [ALetter Hebrew_Letter]; +MidNumLetQ = [MidNumLet Single_Quote]; +ExtFmt = [Extend Format ZWJ]; + +WB3: CR LF; +WB3a: (Newline | CR | LF) ÷; +WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines. + # (but needed with UAX treat-as scheme.) +WB3c: ZWJ GAZ; + +WB5: AHLetter ExtFmt* AHLetter; + +# includes both WB6 and WB7 +WB6: AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt* AHLetter; + +WB7a: Hebrew_Letter ExtFmt* Single_Quote; +WB7b: Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter; # Include WB7c + +WB8: Numeric ExtFmt* Numeric; +WB9: AHLetter ExtFmt* Numeric; +WB10: Numeric ExtFmt* AHLetter; + +WB11: Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric; # includes WB12 +WB13: Katakana ExtFmt* Katakana; + +WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet; +WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana); + +# WB rule 13c, pairs of Regional Indicators stay unbroken. +# Interacts with WB3c. +WB13c.1: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ GAZ; +WB13c.2: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷; + +WB13d: (E_Base | GAZ) ExtFmt* E_Modifier; + +# Rule WB 14 Any ÷ Any +# Interacts with WB3c, do not break between ZWJ and GAZ. +WB14.1: . ExtFmt* ZWJ GAZ; +WB14.2: . ExtFmt* ÷; + diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 52e2f06a650..d1a45514886 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2001-2015 International Business Machines +# Copyright (c) 2001-2016 International Business Machines # Corporation and others. All Rights Reserved. 
# # RBBI Test Data @@ -513,6 +513,18 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal • •\uF8FF\u2028<100>\uF8FF• • \u200B\u2028<100>\u200B• +# Regional Indicator sequences. They group in pairs. The reverse rules are tricky. +# Sequences are long enough that the non-exhaustive monkey test won't reliably pick up problems. + +•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6• +•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6• + +•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6• +•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6• +•\U0001F1E6\U0001F1E6•\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6• +•\U0001F1E6\U0001F1E6•\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6• + + # User Guide example •Parlez-•vous •français ?• diff --git a/icu4c/source/tools/toolutil/toolutil.vcxproj b/icu4c/source/tools/toolutil/toolutil.vcxproj index 4a1d116f2c3..fd6c26508c3 100644 --- a/icu4c/source/tools/toolutil/toolutil.vcxproj +++ b/icu4c/source/tools/toolutil/toolutil.vcxproj @@ -271,7 +271,7 @@ false - + diff --git a/icu4c/source/tools/toolutil/ucbuf.c b/icu4c/source/tools/toolutil/ucbuf.cpp similarity index 99% rename from icu4c/source/tools/toolutil/ucbuf.c rename to icu4c/source/tools/toolutil/ucbuf.cpp index 93dbba133dc..d1c5c4de827 100644 --- a/icu4c/source/tools/toolutil/ucbuf.c +++ b/icu4c/source/tools/toolutil/ucbuf.cpp @@ -1,12 +1,12 @@ /* ******************************************************************************* * -* Copyright (C) 1998-2014, International Business Machines +* Copyright (C) 1998-2016, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* * -* File ucbuf.c +* File ucbuf.cpp * * Modification History: * @@ -415,7 +415,7 @@ ucbuf_getcx32(UCHARBUF* buf,UErrorCode* error) { /* check if u_unescapeAt unescaped and converted * to c32 or not */ - if(c32==0xFFFFFFFF){ + if(c32==(UChar32)0xFFFFFFFF){ if(buf->showWarning) { char context[CONTEXT_LEN+1]; int32_t len = CONTEXT_LEN; diff --git a/icu4c/source/tools/toolutil/ucbuf.h b/icu4c/source/tools/toolutil/ucbuf.h index 37fc783dec8..4cfcee4d7ef 100644 --- a/icu4c/source/tools/toolutil/ucbuf.h +++ b/icu4c/source/tools/toolutil/ucbuf.h @@ -1,12 +1,12 @@ /* ******************************************************************************* * -* Copyright (C) 1998-2015, International Business Machines +* Copyright (C) 1998-2016, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * -* File ucbuf.c +* File ucbuf.h * * Modification History: * @@ -17,6 +17,7 @@ ******************************************************************************* */ +#include "unicode/localpointer.h" #include "unicode/ucnv.h" #include "filestrm.h" @@ -45,11 +46,11 @@ struct ULine { /** * Opens the UCHARBUF with the given file stream and code page for conversion * @param fileName Name of the file to open. - * @param codepage The encoding of the file stream to convert to Unicode. + * @param codepage The encoding of the file stream to convert to Unicode. * If *codepoge is NULL on input the API will try to autodetect * popular Unicode encodings * @param showWarning Flag to print out warnings to STDOUT - * @param buffered If TRUE performs a buffered read of the input file. If FALSE reads + * @param buffered If TRUE performs a buffered read of the input file. If FALSE reads * the whole file into memory and converts it. * @param err is a pointer to a valid UErrorCode value. 
If this value * indicates a failure on entry, the function will immediately return. @@ -82,7 +83,7 @@ U_CAPI int32_t U_EXPORT2 ucbuf_getc32(UCHARBUF* buf,UErrorCode* err); /** - * Gets a UTF-16 code unit at the current position from the converted buffer after + * Gets a UTF-16 code unit at the current position from the converted buffer after * unescaping and increments the current position. If the escape sequence is for UTF-32 * code point (\\Uxxxxxxxx) then a UTF-32 codepoint is returned * @param buf Pointer to UCHARBUF structure @@ -95,7 +96,7 @@ ucbuf_getcx32(UCHARBUF* buf,UErrorCode* err); /** * Gets a pointer to the current position in the internal buffer and length of the line. - * It imperative to make a copy of the returned buffere before performing operations on it. + * It is imperative to make a copy of the returned buffer before performing operations on it. * @param buf Pointer to UCHARBUF structure * @param len Output param to receive the len of the buffer returned till end of the line * @param err is a pointer to a valid UErrorCode value. If this value @@ -141,6 +142,14 @@ ucbuf_close(UCHARBUF* buf); U_NAMESPACE_BEGIN +/** + * \class LocalUCHARBUFPointer + * "Smart pointer" class, closes a UCHARBUF via ucbuf_close(). + * For most methods see the LocalPointerBase base class. + * + * @see LocalPointerBase + * @see LocalPointer + */ U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close); U_NAMESPACE_END @@ -155,7 +164,7 @@ ucbuf_ungetc(int32_t ungetChar,UCHARBUF* buf); /** - * Autodetects the encoding of the file stream. Only Unicode charsets are autodectected. + * Autodetects the encoding of the file stream. Only Unicode charsets are autodetected. * Some Unicode charsets are stateful and need byte identifiers to be converted also to bring * the converter to correct state for converting the rest of the stream. So the UConverter parameter * is necessary. 
@@ -175,7 +184,7 @@ ucbuf_autodetect(const char* fileName, const char** cp,UConverter** conv, int32_t* signatureLength, UErrorCode* status); /** - * Autodetects the encoding of the file stream. Only Unicode charsets are autodectected. + * Autodetects the encoding of the file stream. Only Unicode charsets are autodetected. * Some Unicode charsets are stateful and need byte identifiers to be converted also to bring * the converter to correct state for converting the rest of the stream. So the UConverter parameter * is necessary.