From b552700cc621920edc968f67ffe28bb0fc430668 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Sun, 28 Feb 2016 19:14:48 +0000 Subject: [PATCH] ICU-12081 RBBI extensions & Emoji rules. Import rule data to Java from C++, port code changes. X-SVN-Rev: 38422 --- .../core/src/com/ibm/icu/text/RBBINode.java | 37 +- .../com/ibm/icu/text/RBBIRuleParseTable.java | 243 +++---- .../src/com/ibm/icu/text/RBBIRuleScanner.java | 78 ++- .../com/ibm/icu/text/RBBITableBuilder.java | 115 ++-- .../ibm/icu/text/RuleBasedBreakIterator.java | 442 ++++++------ icu4j/main/shared/data/icudata.jar | 4 +- icu4j/main/shared/data/icutzdata.jar | 2 +- .../ibm/icu/dev/test/rbbi/RBBITestMonkey.java | 646 ++++++++++-------- 8 files changed, 865 insertions(+), 702 deletions(-) diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBINode.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBINode.java index 028e23b8386..cefbbd02166 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBINode.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBINode.java @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 2001-2010, International Business Machines Corporation and + * Copyright (c) 2001-2016, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -17,7 +17,7 @@ import com.ibm.icu.impl.Assert; */ class RBBINode { - + // enum NodeType { static final int setRef = 0; static final int uset = 1; @@ -36,7 +36,7 @@ class RBBINode { static final int opReverse = 14; static final int opLParen = 15; static final int nodeTypeLimit = 16; // For Assertion checking only. - + static final String [] nodeTypeNames = { "setRef", "uset", @@ -56,20 +56,20 @@ class RBBINode { "opLParen" }; -// enum OpPrecedence { +// enum OpPrecedence { static final int precZero = 0; static final int precStart = 1; static final int precLParen = 2; static final int precOpOr = 3; static final int precOpCat = 4; - + int fType; // enum NodeType RBBINode fParent; RBBINode fLeftChild; RBBINode fRightChild; UnicodeSet fInputSet; // For uset nodes only. int fPrecedence = precZero; // enum OpPrecedence, For binary ops only. - + String fText; // Text corresponding to this node. // May be lazily evaluated when (if) needed // for some node types. @@ -89,12 +89,17 @@ class RBBINode { // state transition table. boolean fLookAheadEnd; // For endMark nodes, set TRUE if - // marking the end of a look-ahead rule. + // marking the end of a look-ahead rule. + + boolean fRuleRoot; // True if this node is the root of a rule. + boolean fChainIn; // True if chaining into this rule is allowed + // (no '^' present). + Set fFirstPosSet; // See Aho DFA table generation algorithm - Set fLastPosSet; // See Aho. + Set fLastPosSet; // See Aho. Set fFollowPos; // See Aho. - + int fSerialNum; // Debugging aids. Each node gets a unique serial number. static int gLastSerial; @@ -129,6 +134,8 @@ class RBBINode { fLastPos = other.fLastPos; fNullable = other.fNullable; fVal = other.fVal; + fRuleRoot = false; + fChainIn = other.fChainIn; fFirstPosSet = new HashSet(other.fFirstPosSet); fLastPosSet = new HashSet(other.fLastPosSet); fFollowPos = new HashSet(other.fFollowPos); @@ -163,6 +170,8 @@ class RBBINode { n.fRightChild.fParent = n; } } + n.fRuleRoot = this.fRuleRoot; + n.fChainIn = this.fChainIn; return n; } @@ -259,8 +268,8 @@ class RBBINode { } } - - + + //------------------------------------------------------------------------- // // print. Print out a single node, for debugging. @@ -279,7 +288,7 @@ class RBBINode { RBBINode.printInt(n.fRightChild==null? 0 : n.fRightChild.fSerialNum, 12); RBBINode.printInt(n.fFirstPos, 12); RBBINode.printInt(n.fVal, 7); - + if (n.fType == varRef) { System.out.print(" " + n.fText); } @@ -287,7 +296,7 @@ class RBBINode { System.out.println(""); } ///CLOVER:ON - + // Print a String in a fixed field size. // Debugging function. @@ -344,7 +353,7 @@ class RBBINode { if (fLeftChild != null) { fLeftChild.printTree(false); } - + if (fRightChild != null) { fRightChild.printTree(false); } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleParseTable.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleParseTable.java index b2f78d6fc97..53cd225e283 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleParseTable.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleParseTable.java @@ -1,7 +1,7 @@ /* ******************************************************************************* - * Copyright (C) 2003-2010, International Business Machines Corporation and - * others. All Rights Reserved. + * Copyright (c) 2003-2016, International Business Machines + * Corporation and others. All Rights Reserved. ******************************************************************************* */ @@ -13,6 +13,8 @@ package com.ibm.icu.text; * rule parser. * It is generated by the Perl script "rbbicst.pl" from * the rule parser state definitions file "rbbirpt.txt". + * @internal + * */ class RBBIRuleParseTable { @@ -29,24 +31,25 @@ class RBBIRuleParseTable static final short doExprStart = 11; static final short doLParen = 12; static final short doNOP = 13; - static final short doOptionEnd = 14; - static final short doOptionStart = 15; - static final short doReverseDir = 16; - static final short doRuleChar = 17; - static final short doRuleError = 18; - static final short doRuleErrorAssignExpr = 19; - static final short doScanUnicodeSet = 20; - static final short doSlash = 21; - static final short doStartAssign = 22; - static final short doStartTagValue = 23; - static final short doStartVariableName = 24; - static final short doTagDigit = 25; - static final short doTagExpectedError = 26; - static final short doTagValue = 27; - static final short doUnaryOpPlus = 28; - static final short doUnaryOpQuestion = 29; - static final short doUnaryOpStar = 30; - static final short doVariableNameExpectedErr = 31; + static final short doNoChain = 14; + static final short doOptionEnd = 15; + static final short doOptionStart = 16; + static final short doReverseDir = 17; + static final short doRuleChar = 18; + static final short doRuleError = 19; + static final short doRuleErrorAssignExpr = 20; + static final short doScanUnicodeSet = 21; + static final short doSlash = 22; + static final short doStartAssign = 23; + static final short doStartTagValue = 24; + static final short doStartVariableName = 25; + static final short doTagDigit = 26; + static final short doTagExpectedError = 27; + static final short doTagValue = 28; + static final short doUnaryOpPlus = 29; + static final short doUnaryOpQuestion = 30; + static final short doUnaryOpStar = 31; + static final short doVariableNameExpectedErr = 32; static final short kRuleSet_default = 255; static final short kRuleSet_digit_char = 128; @@ -73,104 +76,112 @@ class RBBIRuleParseTable fNextChar = nc; fStateName = sn; } - } + }; static RBBIRuleTableElement[] gRuleParseStateTable = { new RBBIRuleTableElement(doNOP, 0, 0,0, true, null ) // 0 - , new RBBIRuleTableElement(doExprStart, 254, 21, 8, false, "start") // 1 + , new RBBIRuleTableElement(doExprStart, 254, 29, 9, false, "start") // 1 , new RBBIRuleTableElement(doNOP, 132, 1,0, true, null ) // 2 - , new RBBIRuleTableElement(doExprStart,'$', 80, 90, false, null ) // 3 - , new RBBIRuleTableElement(doNOP,'!', 11,0, true, null ) // 4 - , new RBBIRuleTableElement(doNOP,';', 1,0, true, null ) // 5 - , new RBBIRuleTableElement(doNOP, 252, 0,0, false, null ) // 6 - , new RBBIRuleTableElement(doExprStart, 255, 21, 8, false, null ) // 7 - , new RBBIRuleTableElement(doEndOfRule,';', 1,0, true, "break-rule-end") // 8 - , new RBBIRuleTableElement(doNOP, 132, 8,0, true, null ) // 9 - , new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 10 - , new RBBIRuleTableElement(doNOP,'!', 13,0, true, "rev-option") // 11 - , new RBBIRuleTableElement(doReverseDir, 255, 20, 8, false, null ) // 12 - , new RBBIRuleTableElement(doOptionStart, 130, 15,0, true, "option-scan1") // 13 - , new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 14 - , new RBBIRuleTableElement(doNOP, 129, 15,0, true, "option-scan2") // 15 - , new RBBIRuleTableElement(doOptionEnd, 255, 17,0, false, null ) // 16 - , new RBBIRuleTableElement(doNOP,';', 1,0, true, "option-scan3") // 17 - , new RBBIRuleTableElement(doNOP, 132, 17,0, true, null ) // 18 - , new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 19 - , new RBBIRuleTableElement(doExprStart, 255, 21, 8, false, "reverse-rule") // 20 - , new RBBIRuleTableElement(doRuleChar, 254, 30,0, true, "term") // 21 - , new RBBIRuleTableElement(doNOP, 132, 21,0, true, null ) // 22 - , new RBBIRuleTableElement(doRuleChar, 131, 30,0, true, null ) // 23 - , new RBBIRuleTableElement(doNOP,'[', 86, 30, false, null ) // 24 - , new RBBIRuleTableElement(doLParen,'(', 21, 30, true, null ) // 25 - , new RBBIRuleTableElement(doNOP,'$', 80, 29, false, null ) // 26 - , new RBBIRuleTableElement(doDotAny,'.', 30,0, true, null ) // 27 - , new RBBIRuleTableElement(doRuleError, 255, 95,0, false, null ) // 28 - , new RBBIRuleTableElement(doCheckVarDef, 255, 30,0, false, "term-var-ref") // 29 - , new RBBIRuleTableElement(doNOP, 132, 30,0, true, "expr-mod") // 30 - , new RBBIRuleTableElement(doUnaryOpStar,'*', 35,0, true, null ) // 31 - , new RBBIRuleTableElement(doUnaryOpPlus,'+', 35,0, true, null ) // 32 - , new RBBIRuleTableElement(doUnaryOpQuestion,'?', 35,0, true, null ) // 33 - , new RBBIRuleTableElement(doNOP, 255, 35,0, false, null ) // 34 - , new RBBIRuleTableElement(doExprCatOperator, 254, 21,0, false, "expr-cont") // 35 - , new RBBIRuleTableElement(doNOP, 132, 35,0, true, null ) // 36 - , new RBBIRuleTableElement(doExprCatOperator, 131, 21,0, false, null ) // 37 - , new RBBIRuleTableElement(doExprCatOperator,'[', 21,0, false, null ) // 38 - , new RBBIRuleTableElement(doExprCatOperator,'(', 21,0, false, null ) // 39 - , new RBBIRuleTableElement(doExprCatOperator,'$', 21,0, false, null ) // 40 - , new RBBIRuleTableElement(doExprCatOperator,'.', 21,0, false, null ) // 41 - , new RBBIRuleTableElement(doExprCatOperator,'/', 47,0, false, null ) // 42 - , new RBBIRuleTableElement(doExprCatOperator,'{', 59,0, true, null ) // 43 - , new RBBIRuleTableElement(doExprOrOperator,'|', 21,0, true, null ) // 44 - , new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 45 - , new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 46 - , new RBBIRuleTableElement(doSlash,'/', 49,0, true, "look-ahead") // 47 - , new RBBIRuleTableElement(doNOP, 255, 95,0, false, null ) // 48 - , new RBBIRuleTableElement(doExprCatOperator, 254, 21,0, false, "expr-cont-no-slash") // 49 - , new RBBIRuleTableElement(doNOP, 132, 35,0, true, null ) // 50 - , new RBBIRuleTableElement(doExprCatOperator, 131, 21,0, false, null ) // 51 - , new RBBIRuleTableElement(doExprCatOperator,'[', 21,0, false, null ) // 52 - , new RBBIRuleTableElement(doExprCatOperator,'(', 21,0, false, null ) // 53 - , new RBBIRuleTableElement(doExprCatOperator,'$', 21,0, false, null ) // 54 - , new RBBIRuleTableElement(doExprCatOperator,'.', 21,0, false, null ) // 55 - , new RBBIRuleTableElement(doExprOrOperator,'|', 21,0, true, null ) // 56 - , new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 57 - , new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 58 - , new RBBIRuleTableElement(doNOP, 132, 59,0, true, "tag-open") // 59 - , new RBBIRuleTableElement(doStartTagValue, 128, 62,0, false, null ) // 60 - , new RBBIRuleTableElement(doTagExpectedError, 255, 95,0, false, null ) // 61 - , new RBBIRuleTableElement(doNOP, 132, 66,0, true, "tag-value") // 62 - , new RBBIRuleTableElement(doNOP,'}', 66,0, false, null ) // 63 - , new RBBIRuleTableElement(doTagDigit, 128, 62,0, true, null ) // 64 - , new RBBIRuleTableElement(doTagExpectedError, 255, 95,0, false, null ) // 65 - , new RBBIRuleTableElement(doNOP, 132, 66,0, true, "tag-close") // 66 - , new RBBIRuleTableElement(doTagValue,'}', 69,0, true, null ) // 67 - , new RBBIRuleTableElement(doTagExpectedError, 255, 95,0, false, null ) // 68 - , new RBBIRuleTableElement(doExprCatOperator, 254, 21,0, false, "expr-cont-no-tag") // 69 - , new RBBIRuleTableElement(doNOP, 132, 69,0, true, null ) // 70 - , new RBBIRuleTableElement(doExprCatOperator, 131, 21,0, false, null ) // 71 - , new RBBIRuleTableElement(doExprCatOperator,'[', 21,0, false, null ) // 72 - , new RBBIRuleTableElement(doExprCatOperator,'(', 21,0, false, null ) // 73 - , new RBBIRuleTableElement(doExprCatOperator,'$', 21,0, false, null ) // 74 - , new RBBIRuleTableElement(doExprCatOperator,'.', 21,0, false, null ) // 75 - , new RBBIRuleTableElement(doExprCatOperator,'/', 47,0, false, null ) // 76 - , new RBBIRuleTableElement(doExprOrOperator,'|', 21,0, true, null ) // 77 - , new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 78 - , new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 79 - , new RBBIRuleTableElement(doStartVariableName,'$', 82,0, true, "scan-var-name") // 80 - , new RBBIRuleTableElement(doNOP, 255, 95,0, false, null ) // 81 - , new RBBIRuleTableElement(doNOP, 130, 84,0, true, "scan-var-start") // 82 - , new RBBIRuleTableElement(doVariableNameExpectedErr, 255, 95,0, false, null ) // 83 - , new RBBIRuleTableElement(doNOP, 129, 84,0, true, "scan-var-body") // 84 - , new RBBIRuleTableElement(doEndVariableName, 255, 255,0, false, null ) // 85 - , new RBBIRuleTableElement(doScanUnicodeSet,'[', 255,0, true, "scan-unicode-set") // 86 - , new RBBIRuleTableElement(doScanUnicodeSet,'p', 255,0, true, null ) // 87 - , new RBBIRuleTableElement(doScanUnicodeSet,'P', 255,0, true, null ) // 88 - , new RBBIRuleTableElement(doNOP, 255, 95,0, false, null ) // 89 - , new RBBIRuleTableElement(doNOP, 132, 90,0, true, "assign-or-rule") // 90 - , new RBBIRuleTableElement(doStartAssign,'=', 21, 93, true, null ) // 91 - , new RBBIRuleTableElement(doNOP, 255, 29, 8, false, null ) // 92 - , new RBBIRuleTableElement(doEndAssign,';', 1,0, true, "assign-end") // 93 - , new RBBIRuleTableElement(doRuleErrorAssignExpr, 255, 95,0, false, null ) // 94 - , new RBBIRuleTableElement(doExit, 255, 95,0, true, "errorDeath") // 95 + , new RBBIRuleTableElement(doNoChain,'^', 12, 9, true, null ) // 3 + , new RBBIRuleTableElement(doExprStart,'$', 88, 98, false, null ) // 4 + , new RBBIRuleTableElement(doNOP,'!', 19,0, true, null ) // 5 + , new RBBIRuleTableElement(doNOP,';', 1,0, true, null ) // 6 + , new RBBIRuleTableElement(doNOP, 252, 0,0, false, null ) // 7 + , new RBBIRuleTableElement(doExprStart, 255, 29, 9, false, null ) // 8 + , new RBBIRuleTableElement(doEndOfRule,';', 1,0, true, "break-rule-end") // 9 + , new RBBIRuleTableElement(doNOP, 132, 9,0, true, null ) // 10 + , new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 11 + , new RBBIRuleTableElement(doExprStart, 254, 29,0, false, "start-after-caret") // 12 + , new RBBIRuleTableElement(doNOP, 132, 12,0, true, null ) // 13 + , new RBBIRuleTableElement(doRuleError,'^', 103,0, false, null ) // 14 + , new RBBIRuleTableElement(doExprStart,'$', 88, 37, false, null ) // 15 + , new RBBIRuleTableElement(doRuleError,';', 103,0, false, null ) // 16 + , new RBBIRuleTableElement(doRuleError, 252, 103,0, false, null ) // 17 + , new RBBIRuleTableElement(doExprStart, 255, 29,0, false, null ) // 18 + , new RBBIRuleTableElement(doNOP,'!', 21,0, true, "rev-option") // 19 + , new RBBIRuleTableElement(doReverseDir, 255, 28, 9, false, null ) // 20 + , new RBBIRuleTableElement(doOptionStart, 130, 23,0, true, "option-scan1") // 21 + , new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 22 + , new RBBIRuleTableElement(doNOP, 129, 23,0, true, "option-scan2") // 23 + , new RBBIRuleTableElement(doOptionEnd, 255, 25,0, false, null ) // 24 + , new RBBIRuleTableElement(doNOP,';', 1,0, true, "option-scan3") // 25 + , new RBBIRuleTableElement(doNOP, 132, 25,0, true, null ) // 26 + , new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 27 + , new RBBIRuleTableElement(doExprStart, 255, 29, 9, false, "reverse-rule") // 28 + , new RBBIRuleTableElement(doRuleChar, 254, 38,0, true, "term") // 29 + , new RBBIRuleTableElement(doNOP, 132, 29,0, true, null ) // 30 + , new RBBIRuleTableElement(doRuleChar, 131, 38,0, true, null ) // 31 + , new RBBIRuleTableElement(doNOP,'[', 94, 38, false, null ) // 32 + , new RBBIRuleTableElement(doLParen,'(', 29, 38, true, null ) // 33 + , new RBBIRuleTableElement(doNOP,'$', 88, 37, false, null ) // 34 + , new RBBIRuleTableElement(doDotAny,'.', 38,0, true, null ) // 35 + , new RBBIRuleTableElement(doRuleError, 255, 103,0, false, null ) // 36 + , new RBBIRuleTableElement(doCheckVarDef, 255, 38,0, false, "term-var-ref") // 37 + , new RBBIRuleTableElement(doNOP, 132, 38,0, true, "expr-mod") // 38 + , new RBBIRuleTableElement(doUnaryOpStar,'*', 43,0, true, null ) // 39 + , new RBBIRuleTableElement(doUnaryOpPlus,'+', 43,0, true, null ) // 40 + , new RBBIRuleTableElement(doUnaryOpQuestion,'?', 43,0, true, null ) // 41 + , new RBBIRuleTableElement(doNOP, 255, 43,0, false, null ) // 42 + , new RBBIRuleTableElement(doExprCatOperator, 254, 29,0, false, "expr-cont") // 43 + , new RBBIRuleTableElement(doNOP, 132, 43,0, true, null ) // 44 + , new RBBIRuleTableElement(doExprCatOperator, 131, 29,0, false, null ) // 45 + , new RBBIRuleTableElement(doExprCatOperator,'[', 29,0, false, null ) // 46 + , new RBBIRuleTableElement(doExprCatOperator,'(', 29,0, false, null ) // 47 + , new RBBIRuleTableElement(doExprCatOperator,'$', 29,0, false, null ) // 48 + , new RBBIRuleTableElement(doExprCatOperator,'.', 29,0, false, null ) // 49 + , new RBBIRuleTableElement(doExprCatOperator,'/', 55,0, false, null ) // 50 + , new RBBIRuleTableElement(doExprCatOperator,'{', 67,0, true, null ) // 51 + , new RBBIRuleTableElement(doExprOrOperator,'|', 29,0, true, null ) // 52 + , new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 53 + , new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 54 + , new RBBIRuleTableElement(doSlash,'/', 57,0, true, "look-ahead") // 55 + , new RBBIRuleTableElement(doNOP, 255, 103,0, false, null ) // 56 + , new RBBIRuleTableElement(doExprCatOperator, 254, 29,0, false, "expr-cont-no-slash") // 57 + , new RBBIRuleTableElement(doNOP, 132, 43,0, true, null ) // 58 + , new RBBIRuleTableElement(doExprCatOperator, 131, 29,0, false, null ) // 59 + , new RBBIRuleTableElement(doExprCatOperator,'[', 29,0, false, null ) // 60 + , new RBBIRuleTableElement(doExprCatOperator,'(', 29,0, false, null ) // 61 + , new RBBIRuleTableElement(doExprCatOperator,'$', 29,0, false, null ) // 62 + , new RBBIRuleTableElement(doExprCatOperator,'.', 29,0, false, null ) // 63 + , new RBBIRuleTableElement(doExprOrOperator,'|', 29,0, true, null ) // 64 + , new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 65 + , new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 66 + , new RBBIRuleTableElement(doNOP, 132, 67,0, true, "tag-open") // 67 + , new RBBIRuleTableElement(doStartTagValue, 128, 70,0, false, null ) // 68 + , new RBBIRuleTableElement(doTagExpectedError, 255, 103,0, false, null ) // 69 + , new RBBIRuleTableElement(doNOP, 132, 74,0, true, "tag-value") // 70 + , new RBBIRuleTableElement(doNOP,'}', 74,0, false, null ) // 71 + , new RBBIRuleTableElement(doTagDigit, 128, 70,0, true, null ) // 72 + , new RBBIRuleTableElement(doTagExpectedError, 255, 103,0, false, null ) // 73 + , new RBBIRuleTableElement(doNOP, 132, 74,0, true, "tag-close") // 74 + , new RBBIRuleTableElement(doTagValue,'}', 77,0, true, null ) // 75 + , new RBBIRuleTableElement(doTagExpectedError, 255, 103,0, false, null ) // 76 + , new RBBIRuleTableElement(doExprCatOperator, 254, 29,0, false, "expr-cont-no-tag") // 77 + , new RBBIRuleTableElement(doNOP, 132, 77,0, true, null ) // 78 + , new RBBIRuleTableElement(doExprCatOperator, 131, 29,0, false, null ) // 79 + , new RBBIRuleTableElement(doExprCatOperator,'[', 29,0, false, null ) // 80 + , new RBBIRuleTableElement(doExprCatOperator,'(', 29,0, false, null ) // 81 + , new RBBIRuleTableElement(doExprCatOperator,'$', 29,0, false, null ) // 82 + , new RBBIRuleTableElement(doExprCatOperator,'.', 29,0, false, null ) // 83 + , new RBBIRuleTableElement(doExprCatOperator,'/', 55,0, false, null ) // 84 + , new RBBIRuleTableElement(doExprOrOperator,'|', 29,0, true, null ) // 85 + , new RBBIRuleTableElement(doExprRParen,')', 255,0, true, null ) // 86 + , new RBBIRuleTableElement(doExprFinished, 255, 255,0, false, null ) // 87 + , new RBBIRuleTableElement(doStartVariableName,'$', 90,0, true, "scan-var-name") // 88 + , new RBBIRuleTableElement(doNOP, 255, 103,0, false, null ) // 89 + , new RBBIRuleTableElement(doNOP, 130, 92,0, true, "scan-var-start") // 90 + , new RBBIRuleTableElement(doVariableNameExpectedErr, 255, 103,0, false, null ) // 91 + , new RBBIRuleTableElement(doNOP, 129, 92,0, true, "scan-var-body") // 92 + , new RBBIRuleTableElement(doEndVariableName, 255, 255,0, false, null ) // 93 + , new RBBIRuleTableElement(doScanUnicodeSet,'[', 255,0, true, "scan-unicode-set") // 94 + , new RBBIRuleTableElement(doScanUnicodeSet,'p', 255,0, true, null ) // 95 + , new RBBIRuleTableElement(doScanUnicodeSet,'P', 255,0, true, null ) // 96 + , new RBBIRuleTableElement(doNOP, 255, 103,0, false, null ) // 97 + , new RBBIRuleTableElement(doNOP, 132, 98,0, true, "assign-or-rule") // 98 + , new RBBIRuleTableElement(doStartAssign,'=', 29, 101, true, null ) // 99 + , new RBBIRuleTableElement(doNOP, 255, 37, 9, false, null ) // 100 + , new RBBIRuleTableElement(doEndAssign,';', 1,0, true, "assign-end") // 101 + , new RBBIRuleTableElement(doRuleErrorAssignExpr, 255, 103,0, false, null ) // 102 + , new RBBIRuleTableElement(doExit, 255, 103,0, true, "errorDeath") // 103 }; -} +}; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java index 31fb8e1d017..a62b0d917b1 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java @@ -1,9 +1,9 @@ /* ******************************************************************************* - * Copyright (C) 2003-2011, International Business Machines Corporation and others. All Rights Reserved. + * Copyright (C) 2003-2016, International Business Machines Corporation and others. All Rights Reserved. ******************************************************************************* */ - + package com.ibm.icu.text; import java.text.ParsePosition; @@ -19,12 +19,12 @@ import com.ibm.icu.lang.UCharacter; * There is no public API here. */ class RBBIRuleScanner { - + private final static int kStackSize = 100; // The size of the state stack for // rules parsing. Corresponds roughly // to the depth of parentheses nesting // that is allowed in the rules. - + static class RBBIRuleChar { int fChar; boolean fEscaped; @@ -33,7 +33,7 @@ class RBBIRuleScanner { RBBIRuleBuilder fRB; // The rule builder that we are part of. - + int fScanIndex; // Index of current character being processed // in the rule input string. int fNextIndex; // Index of the next character, which @@ -43,49 +43,52 @@ class RBBIRuleScanner { int fCharNum; // Char position within the line. int fLastChar; // Previous char, needed to count CR-LF // as a single line, not two. - + RBBIRuleChar fC = new RBBIRuleChar(); // Current char for parse state machine // processing. String fVarName; // $variableName, valid when we've just // scanned one. - - + + short fStack[] = new short[kStackSize]; // State stack, holds state pushes int fStackPtr; // and pops as specified in the state // transition rules. - + RBBINode fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created // during the parse of a rule int fNodeStackPtr; - - - boolean fReverseRule; // True if the rule currently being scanned + + + boolean fReverseRule; // True if the rule currently being scanned // is a reverse direction rule (if it // starts with a '!') - - boolean fLookAheadRule; // True if the rule includes a '/' + + boolean fLookAheadRule; // True if the rule includes a '/' // somewhere within it. - - RBBISymbolTable fSymbolTable; // symbol table, holds definitions of + + boolean fNoChainInRule; // True if the current rule starts with a '^'. + + + RBBISymbolTable fSymbolTable; // symbol table, holds definitions of // $variable symbols. - + HashMap fSetTable = new HashMap(); // UnicocodeSet hash table, holds indexes to // the sets created while parsing rules. // The key is the string used for creating // the set. - + UnicodeSet fRuleSets[] = new UnicodeSet[10]; // Unicode Sets that are needed during // the scanning of RBBI rules. The // indicies for these are assigned by the // perl script that builds the state tables. // See rbbirpt.h. - + int fRuleNum; // Counts each rule as it is scanned. - + int fOptionStart; // Input index of start of a !!option // keyword, while being scanned. - + static private String gRuleSet_rule_char_pattern = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]"; static private String gRuleSet_name_char_pattern = "[_\\p{L}\\p{N}]"; @@ -94,8 +97,8 @@ class RBBIRuleScanner { static private String gRuleSet_white_space_pattern = "[\\p{Pattern_White_Space}]"; static private String kAny = "any"; - - + + //---------------------------------------------------------------------------------------- // @@ -139,6 +142,12 @@ class RBBIRuleScanner { fRuleNum++; break; + case RBBIRuleParseTable.doNoChain: + // Scanned a '^' while on the rule start state. + fNoChainInRule = true; + break; + + case RBBIRuleParseTable.doExprOrOperator: { fixOpStack(RBBINode.precOpCat); RBBINode operandNode = fNodeStack[fNodeStackPtr--]; @@ -241,11 +250,11 @@ class RBBIRuleScanner { printNodeStack("end of rule"); } Assert.assrt(fNodeStackPtr == 1); + RBBINode thisRule = fNodeStack[fNodeStackPtr]; // If this rule includes a look-ahead '/', add a endMark node to the // expression tree. if (fLookAheadRule) { - RBBINode thisRule = fNodeStack[fNodeStackPtr]; RBBINode endNode = pushNewNode(RBBINode.endMark); RBBINode catNode = pushNewNode(RBBINode.opCat); fNodeStackPtr -= 2; @@ -254,8 +263,24 @@ class RBBIRuleScanner { fNodeStack[fNodeStackPtr] = catNode; endNode.fVal = fRuleNum; endNode.fLookAheadEnd = true; + thisRule = catNode; + + // TODO: Disable chaining out of look-ahead (hard break) rules. + // The break on rule match is forced, so there is no point in building up + // the state table to chain into another rule for a longer match. } + // Mark this node as being the root of a rule. + thisRule.fRuleRoot = true; + + // Flag if chaining into this rule is wanted. + // + if (fRB.fChainRules && // If rule chaining is enabled globally via !!chain + !fNoChainInRule) { // and no '^' chain-in inhibit was on this rule + thisRule.fChainIn = true; + } + + // All rule expressions are ORed together. // The ';' that terminates an expression really just functions as a // '|' with @@ -269,12 +294,12 @@ class RBBIRuleScanner { int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree); if (fRB.fTreeRoots[destRules] != null) { - // This is not the first rule encounted. + // This is not the first rule encountered. // OR previous stuff (from *destRules) // with the current rule expression (on the Node Stack) // with the resulting OR expression going to *destRules // - RBBINode thisRule = fNodeStack[fNodeStackPtr]; + thisRule = fNodeStack[fNodeStackPtr]; RBBINode prevRules = fRB.fTreeRoots[destRules]; RBBINode orNode = pushNewNode(RBBINode.opOr); orNode.fLeftChild = prevRules; @@ -289,6 +314,7 @@ class RBBIRuleScanner { } fReverseRule = false; // in preparation for the next rule. fLookAheadRule = false; + fNoChainInRule = false; fNodeStackPtr = 0; } break; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java index 73c9c4c92f0..2140d5ed4f2 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (c) 2002-2009, International Business Machines +* Copyright (c) 2002-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -28,9 +28,9 @@ import com.ibm.icu.lang.UProperty; // There is no user-visible public API here. // class RBBITableBuilder { - - - + + + // // RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors, // one for each state. @@ -58,8 +58,8 @@ class RBBITableBuilder { // symbol. } } - - + + private RBBIRuleBuilder fRB; private int fRootIx; // The array index into RBBIRuleBuilder.fTreeRoots // for the parse tree to operate on. @@ -84,7 +84,7 @@ class RBBITableBuilder { - + //----------------------------------------------------------------------------- // // RBBITableBuilder::build - This is the main function for building the DFA state transtion @@ -109,11 +109,11 @@ class RBBITableBuilder { } // - // If the rules contained any references to {bof} + // If the rules contained any references to {bof} // add a {bof} to the - // tree. Means that all matches must start out with the + // tree. Means that all matches must start out with the // {bof} fake character. - // + // if (fRB.fSetBuilder.sawBOF()) { RBBINode bofTop = new RBBINode(RBBINode.opCat); RBBINode bofLeaf = new RBBINode(RBBINode.leafChar); @@ -361,6 +361,25 @@ class RBBITableBuilder { } } + //----------------------------------------------------------------------------- + // + // addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged + // as roots of a rule to a destination vector. + // + //----------------------------------------------------------------------------- + void addRuleRootNodes(List dest, RBBINode node) { + if (node == null) { + return; + } + if (node.fRuleRoot) { + dest.add(node); + // Note: rules cannot nest. If we found a rule start node, + // no child node can also be a start node. + return; + } + addRuleRootNodes(dest, node.fLeftChild); + addRuleRootNodes(dest, node.fRightChild); + } //----------------------------------------------------------------------------- // @@ -379,17 +398,21 @@ class RBBITableBuilder { // get a list all leaf nodes tree.findNodes(leafNodes, RBBINode.leafChar); - // Get all nodes that can be the start a match, which is FirstPosition() - // of the portion of the tree corresponding to user-written rules. - // See the tree description in bofFixup(). - RBBINode userRuleRoot = tree; - if (fRB.fSetBuilder.sawBOF()) { - userRuleRoot = tree.fLeftChild.fRightChild; + // Collect all leaf nodes that can start matches for rules + // with inbound chaining enabled, which is the union of the + // firstPosition sets from each of the rule root nodes. + + List ruleRootNodes = new ArrayList(); + addRuleRootNodes(ruleRootNodes, tree); + + Set matchStartNodes = new HashSet(); + for (RBBINode node: ruleRootNodes) { + if (node.fChainIn) { + matchStartNodes.addAll(node.fFirstPosSet); + } } - Assert.assrt(userRuleRoot != null); - Set matchStartNodes = userRuleRoot.fFirstPosSet; - // Iteratate over all leaf nodes, + // Iterate over all leaf nodes, // for (RBBINode tNode : leafNodes) { RBBINode endNode = null; @@ -461,9 +484,9 @@ class RBBITableBuilder { // // The parse tree looks like this ... // fTree root --. - // / \ + // / \ // <#end node> - // / \ + // / \ // rest // of tree // @@ -477,7 +500,7 @@ class RBBITableBuilder { // (excluding the fake bofNode) // We want the nodes that can start a match in the // part labeled "rest of tree" - // + // Set matchStartNodes = fRB.fTreeRoots[fRootIx].fLeftChild.fRightChild.fFirstPosSet; for (RBBINode startNode : matchStartNodes) { if (startNode.fType != RBBINode.leafChar) { @@ -489,7 +512,7 @@ class RBBITableBuilder { // explicitly written into a rule. // Add everything from the followPos set of this node to the // followPos set of the fake bofNode at the start of the tree. - // + // bofNode.fFollowPos.addAll(startNode.fFollowPos); } } @@ -705,7 +728,7 @@ class RBBITableBuilder { // The RBBI runtime uses an array of {sets of status values} that can // be returned for boundaries. Each accepting state that has non-zero // status includes an index into this array. The format of the array - // is + // is // Num of status values in group 1 // status val // status val @@ -718,7 +741,7 @@ class RBBITableBuilder { // // //----------------------------------------------------------------------------- - + void mergeRuleStatusVals() { // // The basic outline of what happens here is this... @@ -731,14 +754,14 @@ class RBBITableBuilder { // add the tag list for this state to the global list. // int n; - + // Pre-load a single tag of {0} into the table. // We will need this as a default, for rule sets with no explicit tagging, // or with explicit tagging of {0}. if (fRB.fRuleStatusVals.size() == 0) { fRB.fRuleStatusVals.add(Integer.valueOf(1)); // Num of statuses in group fRB.fRuleStatusVals.add(Integer.valueOf(0)); // and our single status of zero - + SortedSet s0 = new TreeSet(); Integer izero = Integer.valueOf(0); fRB.fStatusSets.put(s0, izero); @@ -756,17 +779,17 @@ class RBBITableBuilder { if (arrayIndexI == null) { // This is the first encounter of this set of status values. // Add them to the statusSets map, This map associates - // the set of status values with an index in the runtime status + // the set of status values with an index in the runtime status // values array. arrayIndexI = Integer.valueOf(fRB.fRuleStatusVals.size()); fRB.fStatusSets.put(statusVals, arrayIndexI); - + // Add the new set of status values to the vector of values that // will eventually become the array used by the runtime engine. fRB.fRuleStatusVals.add(Integer.valueOf(statusVals.size())); fRB.fRuleStatusVals.addAll(statusVals); } - + // Save the runtime array index back into the state descriptor. sd.fTagsIdx = arrayIndexI.intValue(); } @@ -784,7 +807,7 @@ class RBBITableBuilder { // for each node in the tree. // //----------------------------------------------------------------------------- - + void printPosSets(RBBINode n) { if (n==null) { return; @@ -804,7 +827,7 @@ class RBBITableBuilder { printPosSets(n.fLeftChild); printPosSets(n.fRightChild); } - + @@ -860,7 +883,7 @@ class RBBITableBuilder { // See struct RBBIStateTable in ICU4C, common/rbbidata.h // //----------------------------------------------------------------------------- - + short [] exportTable() { int state; int col; @@ -870,18 +893,18 @@ class RBBITableBuilder { } Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff && - fDStates.size() < 0x7fff); + fDStates.size() < 0x7fff); int numStates = fDStates.size(); - + // Size of table size in shorts. // the "4" is the size of struct RBBIStateTableRow, the row header part only. int rowLen = 4 + fRB.fSetBuilder.getNumCharCategories(); int tableSize = getTableSize() / 2; - + short [] table = new short[tableSize]; - + // // Fill in the header fields. // Annoying because they really want to be ints, not shorts. @@ -893,7 +916,7 @@ class RBBITableBuilder { // RBBIStateTable.fRowLen table[RBBIDataWrapper.ROWLEN] = (short)(rowLen >>> 16); table[RBBIDataWrapper.ROWLEN+1] = (short)(rowLen & 0x0000ffff); - + // RBBIStateTable.fFlags int flags = 0; if (fRB.fLookAheadHardBreak) { @@ -904,7 +927,7 @@ class RBBITableBuilder { } table[RBBIDataWrapper.FLAGS] = (short)(flags >>> 16); table[RBBIDataWrapper.FLAGS+1] = (short)(flags & 0x0000ffff); - + int numCharCategories = fRB.fSetBuilder.getNumCharCategories(); for (state=0; state s) { for (RBBINode n : s) { RBBINode.printInt(n.fSerialNum, 8); } System.out.println(); } - + //----------------------------------------------------------------------------- @@ -943,7 +966,7 @@ class RBBITableBuilder { // printStates Debug Function. Dump the fully constructed state transition table. // //----------------------------------------------------------------------------- - + void printStates() { int c; // input "character" int n; // state number @@ -964,7 +987,7 @@ class RBBITableBuilder { RBBIStateDescriptor sd = fDStates.get(n); RBBINode.printInt(n, 5); System.out.print(" | "); - + RBBINode.printInt(sd.fAccepting, 3); RBBINode.printInt(sd.fLookAhead, 4); RBBINode.printInt(sd.fTagsIdx, 6); @@ -976,7 +999,7 @@ class RBBITableBuilder { } System.out.print("\n\n"); } - + @@ -985,7 +1008,7 @@ class RBBITableBuilder { // printRuleStatusTable Debug Function. Dump the common rule status table // //----------------------------------------------------------------------------- - + void printRuleStatusTable() { int thisRecord = 0; int nextRecord = 0; @@ -1007,7 +1030,7 @@ class RBBITableBuilder { } System.out.print("\n\n"); } - + } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java index 644d788828f..7d5f4611ef8 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java @@ -30,17 +30,17 @@ import com.ibm.icu.lang.UProperty; import com.ibm.icu.lang.UScript; /** - * Rule Based Break Iterator + * Rule Based Break Iterator * This is a port of the C++ class RuleBasedBreakIterator from ICU4C. - * + * * @stable ICU 2.0 */ public class RuleBasedBreakIterator extends BreakIterator { //======================================================================= // Constructors & Factories //======================================================================= - - /** + + /** * private constructor */ private RuleBasedBreakIterator() { @@ -51,14 +51,14 @@ public class RuleBasedBreakIterator extends BreakIterator { /** * Create a break iterator from a precompiled set of break rules. - * + * * Creating a break iterator from the binary rules is much faster than - * creating one from source rules. - * + * creating one from source rules. + * * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. * Binary break iterator rules are not guaranteed to be compatible between * different versions of ICU. - * + * * @param is an input stream supplying the compiled binary rules. * @throws IOException if there is an error while reading the rules from the InputStream. * @see #compileRules(String, OutputStream) @@ -67,7 +67,7 @@ public class RuleBasedBreakIterator extends BreakIterator { public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException { RuleBasedBreakIterator This = new RuleBasedBreakIterator(); This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is)); - return This; + return This; } /** @@ -129,7 +129,7 @@ public class RuleBasedBreakIterator extends BreakIterator { { RuleBasedBreakIterator result = (RuleBasedBreakIterator)super.clone(); if (fText != null) { - result.fText = (CharacterIterator)(fText.clone()); + result.fText = (CharacterIterator)(fText.clone()); } return result; } @@ -151,15 +151,15 @@ public class RuleBasedBreakIterator extends BreakIterator { if (fRData != other.fRData && (fRData == null || other.fRData == null)) { return false; } - if (fRData != null && other.fRData != null && + if (fRData != null && other.fRData != null && (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) { return false; } if (fText == null && other.fText == null) { - return true; + return true; } if (fText == null || other.fText == null) { - return false; + return false; } return fText.equals(other.fText); } @@ -188,13 +188,13 @@ public class RuleBasedBreakIterator extends BreakIterator { */ public int hashCode() { - return fRData.fRuleSource.hashCode(); + return fRData.fRuleSource.hashCode(); } private static final int START_STATE = 1; // The state number of the starting state private static final int STOP_STATE = 0; // The state-transition value indicating "stop" - + // RBBIRunMode - the state machine runs an extra iteration at the beginning and end // of user text. A variable with this enum type keeps track of where we // are. The state machine only fetches user text input while in RUN mode. @@ -206,14 +206,14 @@ public class RuleBasedBreakIterator extends BreakIterator { * The character iterator through which this BreakIterator accesses the text. */ private CharacterIterator fText = new java.text.StringCharacterIterator(""); - + /** * The rule data for this BreakIterator instance. Package private. */ RBBIDataWrapper fRData; - + /* - * Index of the Rule {tag} values for the most recent match. + * Index of the Rule {tag} values for the most recent match. */ private int fLastRuleStatusIndex; @@ -245,18 +245,18 @@ public class RuleBasedBreakIterator extends BreakIterator { && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0; /** - * What kind of break iterator this is. Set to KIND_LINE by default, + * What kind of break iterator this is. Set to KIND_LINE by default, * since this produces sensible output. */ private int fBreakType = KIND_LINE; - + /** * The "default" break engine - just skips over ranges of dictionary words, * producing no breaks. Should only be used if characters need to be handled * by a dictionary but we have no dictionary implementation for them. */ private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine(); - + /** * when a range of characters is divided up using the dictionary, the break * positions that are discovered are stored here, preventing us from having @@ -271,8 +271,8 @@ public class RuleBasedBreakIterator extends BreakIterator { */ private int fPositionInCache; - - private final ConcurrentHashMap fBreakEngines = + + private final ConcurrentHashMap fBreakEngines = new ConcurrentHashMap(); /** * Dumps caches and performs other actions associated with a complete change @@ -293,18 +293,18 @@ public class RuleBasedBreakIterator extends BreakIterator { */ @Deprecated public void dump() { - this.fRData.dump(); + this.fRData.dump(); } /** * Compile a set of source break rules into the binary state tables used * by the break iterator engine. Creating a break iterator from precompiled * rules is much faster than creating one from source rules. - * + * * Binary break rules are not guaranteed to be compatible between different * versions of ICU. - * - * + * + * * @param rules The source form of the break rules * @param ruleBinary An output stream to receive the compiled rules. * @throws IOException If there is an error writing the output. @@ -314,7 +314,7 @@ public class RuleBasedBreakIterator extends BreakIterator { public static void compileRules(String rules, OutputStream ruleBinary) throws IOException { RBBIRuleBuilder.compileRules(rules, ruleBinary); } - + //======================================================================= // BreakIterator overrides //======================================================================= @@ -337,7 +337,7 @@ public class RuleBasedBreakIterator extends BreakIterator { fText.first(); return fText.getIndex(); } - + /** * Sets the current iteration position to the end of the text. * (i.e., the CharacterIterator's ending offset). @@ -364,7 +364,7 @@ public class RuleBasedBreakIterator extends BreakIterator { fText.setIndex(pos); return pos; } - + /** * Advances the iterator either forward or backward the specified number of steps. * Negative values move backward, and positive values move forward. This is @@ -387,7 +387,7 @@ public class RuleBasedBreakIterator extends BreakIterator { } return result; } - + /** * Advances the iterator to the next boundary position. * @return The position of the first boundary after this one. @@ -424,11 +424,11 @@ public class RuleBasedBreakIterator extends BreakIterator { * process. */ private int checkDictionary(int startPos, int endPos, boolean reverse) { - + // Reset the old break cache first. reset(); - // note: code segment below assumes that dictionary chars are in the + // note: code segment below assumes that dictionary chars are in the // startPos-endPos range // value returned should be next character in sequence if ((endPos - startPos) <= 1) { @@ -465,7 +465,7 @@ public class RuleBasedBreakIterator extends BreakIterator { c = CharacterIteration.current32(fText); category = (short)fRData.fTrie.getCodePointValue(c); } while (c != CharacterIteration.DONE32 && ((category & 0x4000)) != 0); - + // Back up to the last dictionary character rangeEnd = fText.getIndex(); if (c == CharacterIteration.DONE32) { @@ -497,7 +497,7 @@ public class RuleBasedBreakIterator extends BreakIterator { category = (short)fRData.fTrie.getCodePointValue(c); } - + // Loop through the text, looking for ranges of dictionary characters. // For each span, find the appropriate break engine, and ask it to find // any breaks within the span. @@ -518,11 +518,11 @@ public class RuleBasedBreakIterator extends BreakIterator { if (current >= rangeEnd) { break; } - + // We now have a dictionary character. Get the appropriate language object // to deal with it. lbe = getLanguageBreakEngine(c); - + // Ask the language object if there are any breaks. It will leave the text // pointer on the other side of its range, ready to search for the next one. if (lbe != null) { @@ -530,12 +530,12 @@ public class RuleBasedBreakIterator extends BreakIterator { foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, false, fBreakType, breaks); assert fText.getIndex() > startingIdx; } - + // Reload the loop variables for the next go-round c = CharacterIteration.current32(fText); category = (short)fRData.fTrie.getCodePointValue(c); } - + // If we found breaks, build a new break cache. The first and last entries must // be the original starting and ending position. if (foundBreakCount > 0) { @@ -549,15 +549,15 @@ public class RuleBasedBreakIterator extends BreakIterator { if (endPos > breaks.peek()) { breaks.push(endPos); } - + // TODO: get rid of this array, use results from the deque directly fCachedBreakPositions = new int[breaks.size()]; - + int i = 0; while (breaks.size() > 0) { fCachedBreakPositions[i++] = breaks.pollLast(); } - + // If there are breaks, then by definition, we are replacing the original // proposed break by one of the breaks we found. Use following() and // preceding() to do the work. They should never recurse in this case. @@ -573,10 +573,10 @@ public class RuleBasedBreakIterator extends BreakIterator { // to the original proposed break. fText.setIndex(reverse ? startPos : endPos); return (reverse ? startPos : endPos); - + } - - + + /** * Moves the iterator backwards, to the last boundary preceding this one. * @return The position of the last boundary position preceding this one. @@ -585,7 +585,7 @@ public class RuleBasedBreakIterator extends BreakIterator { public int previous() { int result; int startPos; - + CharacterIterator text = getText(); fLastStatusIndexValid = false; @@ -705,7 +705,7 @@ public class RuleBasedBreakIterator extends BreakIterator { return text.getIndex(); } } - + private int rulesFollowing(int offset) { // if the offset passed in is already past the end of the text, // just return DONE; if it's before the beginning, return the @@ -744,7 +744,7 @@ public class RuleBasedBreakIterator extends BreakIterator { } if (fRData.fSFTable != null) { // No Safe point reverse table, but there is a safe pt forward table. - // + // fText.setIndex(offset); previous32(fText); // handle next will give result >= offset @@ -820,7 +820,7 @@ public class RuleBasedBreakIterator extends BreakIterator { return text.getIndex(); } } - + private int rulesPreceding(int offset) { // if the offset passed in is already past the end of the text, // just return DONE; if it's before the beginning, return the @@ -1002,7 +1002,7 @@ public class RuleBasedBreakIterator extends BreakIterator { } /** - * Get the status (tag) values from the break rule(s) that determined the most + * Get the status (tag) values from the break rule(s) that determined the most * recently returned break position. The values appear in the rule source * within brackets, {123}, for example. The default status value for rules * that do not explicitly provide one is zero. @@ -1014,8 +1014,8 @@ public class RuleBasedBreakIterator extends BreakIterator { * the output will be truncated to the available length. No exception * will be thrown. * - * @param fillInArray an array to be filled in with the status values. - * @return The number of rule status values from rules that determined + * @param fillInArray an array to be filled in with the status values. + * @return The number of rule status values from rules that determined * the most recent boundary returned by the break iterator. * In the event that the array is too small, the return value * is the total number of status values that were available, @@ -1026,7 +1026,7 @@ public class RuleBasedBreakIterator extends BreakIterator { public int getRuleStatusVec(int[] fillInArray) { makeRuleStatusValid(); int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex]; - if (fillInArray != null) { + if (fillInArray != null) { int numToCopy = Math.min(numStatusVals, fillInArray.length); for (int i=0; i engine. script = UScript.HAN; } - + LanguageBreakEngine eng = fBreakEngines.get(script); /* if (eng != null && !eng.handles(c, fBreakType)) { @@ -1158,15 +1158,60 @@ public class RuleBasedBreakIterator extends BreakIterator { return eng; } - + private static final int kMaxLookaheads = 8; + private static class LookAheadResults { + int fUsedSlotLimit; + int[] fPositions; + int[] fKeys; + + LookAheadResults() { + fUsedSlotLimit= 0; + fPositions = new int[kMaxLookaheads]; + fKeys = new int[kMaxLookaheads]; + } + + int getPosition(int key) { + for (int i=0; i= kMaxLookaheads) { + assert(false); + i = kMaxLookaheads - 1; + } + fKeys[i] = key; + fPositions[i] = position; + assert(fUsedSlotLimit == i); + fUsedSlotLimit = i + 1; + } + + void reset() { + fUsedSlotLimit = 0; + } + }; + private LookAheadResults fLookAheadMatches = new LookAheadResults(); + /** * The State Machine Engine for moving forward is here. * This function is the heart of the RBBI run time engine. - * + * * @param stateTable * @return the new iterator position - * + * * A note on supplementary characters and the position of underlying * Java CharacterIterator: Normally, a character iterator is positioned at * the char most recently returned by next(). Within this function, when @@ -1201,7 +1246,7 @@ public class RuleBasedBreakIterator extends BreakIterator { // Set the initial state for the state machine int state = START_STATE; - int row = fRData.getRowIndex(state); + int row = fRData.getRowIndex(state); short category = 3; int flagsState = fRData.getStateTableFlags(stateTable); int mode = RBBI_RUN; @@ -1209,14 +1254,12 @@ public class RuleBasedBreakIterator extends BreakIterator { category = 2; mode = RBBI_START; if (TRACE) { - System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); + System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); System.out.print(RBBIDataWrapper.intToHexString(c, 10)); System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); } } - int lookaheadStatus = 0; - int lookaheadTagIdx = 0; - int lookaheadResult = 0; + fLookAheadMatches.reset(); // loop until we reach the end of the text or transition to state 0 while (state != STOP_STATE) { @@ -1226,16 +1269,6 @@ public class RuleBasedBreakIterator extends BreakIterator { // We have already run the loop one last time with the // character set to the pseudo {eof} value. Now it is time // to unconditionally bail out. - - if (lookaheadResult > result) { - // We ran off the end of the string with a pending - // look-ahead match. - // Treat this as if the look-ahead condition had been - // met, and return - // the match at the / position from the look-ahead rule. - result = lookaheadResult; - fLastRuleStatusIndex = lookaheadTagIdx; - } break; } // Run the loop one last time with the fake end-of-input character category @@ -1252,7 +1285,7 @@ public class RuleBasedBreakIterator extends BreakIterator { // which column in the state table to look at. // category = (short) trie.getCodePointValue(c); - + // Check the dictionary bit in the character's category. // Counter is only used by dictionary based iterators (subclasses). // Chars that need to be handled by a dictionary have a flag bit set @@ -1265,15 +1298,15 @@ public class RuleBasedBreakIterator extends BreakIterator { } if (TRACE) { - System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); + System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); System.out.print(RBBIDataWrapper.intToHexString(c, 10)); System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); } - // Advance to the next character. + // Advance to the next character. // If this is a beginning-of-input loop iteration, don't advance. // The next iteration will be processing the first real input character. - c = (int)text.next(); + c = (int)text.next(); if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { c = nextTrail32(text, c); } @@ -1284,7 +1317,7 @@ public class RuleBasedBreakIterator extends BreakIterator { // look up a state transition in the state table state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; - row = fRData.getRowIndex(state); + row = fRData.getRowIndex(state); if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) { // Match found, common case @@ -1299,40 +1332,30 @@ public class RuleBasedBreakIterator extends BreakIterator { fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; } - if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) { - if (lookaheadStatus != 0 - && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) { - // Lookahead match is completed. Set the result accordingly, but only - // if no other rule has matched further in the mean time. - result = lookaheadResult; - fLastRuleStatusIndex = lookaheadTagIdx; - lookaheadStatus = 0; - // TODO: make a standalone hard break in a rule work. - if ((flagsState & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0) { - text.setIndex(result); - return result; - } - // Look-ahead completed, but other rules may match further. Continue on. - // TODO: junk this feature? I don't think it's used anywhere. - continue; + int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING]; + if (completedRule > 0) { + // Lookahead match is completed + int lookaheadResult = fLookAheadMatches.getPosition(completedRule); + if (lookaheadResult >= 0) { + fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; + text.setIndex(lookaheadResult); + return lookaheadResult; } + } - lookaheadResult = text.getIndex(); + int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; + if (rule != 0) { + // At the position of a '/' in a look-ahead match. Record it. + int pos = text.getIndex(); if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { // The iterator has been left in the middle of a surrogate pair. // We want the beginning of it. - lookaheadResult--; + pos--; } - lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; - lookaheadTagIdx = stateTable[row + RBBIDataWrapper.TAGIDX]; - continue; + fLookAheadMatches.setPosition(rule, pos); } - if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) { - // Because this is an accepting state, any in-progress look-ahead match - // is no longer relevant. Clear out the pending lookahead status. - lookaheadStatus = 0; - } + } // End of state machine main loop // The state machine is done. Check whether it found a match... @@ -1340,7 +1363,7 @@ public class RuleBasedBreakIterator extends BreakIterator { // If the iterator failed to advance in the match engine force it ahead by one. // This indicates a defect in the break rules, which should always match // at least one character. - + if (result == initialPosition) { if (TRACE) { System.out.println("Iterator did not move. Advancing by 1."); @@ -1365,31 +1388,28 @@ public class RuleBasedBreakIterator extends BreakIterator { if (fText == null || stateTable == null) { return 0; } - + int state; int category = 0; int mode; - int row; + int row; int c; - int lookaheadStatus = 0; int result = 0; int initialPosition = 0; - int lookaheadResult = 0; - boolean lookAheadHardBreak = - (fRData.getStateTableFlags(stateTable) & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0; - + fLookAheadMatches.reset(); + // handlePrevious() never gets the rule status. // Flag the status as invalid; if the user ever asks for status, we will need // to back up, then re-find the break position using handleNext(), which does // get the status value. fLastStatusIndexValid = false; fLastRuleStatusIndex = 0; - + // set up the starting char initialPosition = fText.getIndex(); result = initialPosition; c = previous32(fText); - + // Set up the initial state for the state machine state = START_STATE; row = fRData.getRowIndex(state); @@ -1399,129 +1419,95 @@ public class RuleBasedBreakIterator extends BreakIterator { category = 2; mode = RBBI_START; } - + if (TRACE) { System.out.println("Handle Prev pos char state category "); } - + // loop until we reach the beginning of the text or transition to state 0 // mainLoop: for (;;) { - innerBlock: { - if (c == DONE32) { - // Reached end of input string. - if (mode == RBBI_END || fRData.fHeader.fVersion == 1) { - // Either this is the old (ICU 3.2 and earlier) format data which - // does not support explicit support for matching {eof}, or - // we have already done the {eof} iteration. Now is the time - // to unconditionally bail out. - if (lookaheadResult < result) { - // We ran off the end of the string with a pending look-ahead match. - // Treat this as if the look-ahead condition had been met, and return - // the match at the / position from the look-ahead rule. - result = lookaheadResult; - lookaheadStatus = 0; - } else if (result == initialPosition) { - // Ran off start, no match found. - // Move one position (towards the start, since we are doing previous.) - fText.setIndex(initialPosition); - previous32(fText); - } - break mainLoop; - } - mode = RBBI_END; - category = 1; - } - - if (mode == RBBI_RUN) { - // look up the current character's category, which tells us - // which column in the state table to look at. - // - category = (short) fRData.fTrie.getCodePointValue(c); - - // Check the dictionary bit in the character's category. - // Counter is only used by dictionary based iterators (subclasses). - // Chars that need to be handled by a dictionary have a flag bit set - // in their category values. - // - if ((category & 0x4000) != 0) { - fDictionaryCharCount++; - // And off the dictionary flag bit. - category &= ~0x4000; - } - } - - - if (TRACE) { - System.out.print(" " + fText.getIndex() + " "); - if (0x20 <= c && c < 0x7f) { - System.out.print(" " + c + " "); - } else { - System.out.print(" " + Integer.toHexString(c) + " "); + if (c == DONE32) { + // Reached end of input string. + if (mode == RBBI_END || fRData.fHeader.fVersion == 1) { + // Either this is the old (ICU 3.2 and earlier) format data which + // does not support explicit support for matching {eof}, or + // we have already done the {eof} iteration. Now is the time + // to unconditionally bail out. + if (result == initialPosition) { + // Ran off start, no match found. + // Move one position (towards the start, since we are doing previous.) + fText.setIndex(initialPosition); + previous32(fText); } - System.out.println(" " + state + " " + category + " "); + break mainLoop; } - - // State Transition - move machine to its next state + mode = RBBI_END; + category = 1; + } + + if (mode == RBBI_RUN) { + // look up the current character's category, which tells us + // which column in the state table to look at. + // + category = (short) fRData.fTrie.getCodePointValue(c); + + // Check the dictionary bit in the character's category. + // Counter is only used by dictionary based iterators (subclasses). + // Chars that need to be handled by a dictionary have a flag bit set + // in their category values. // - state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; - row = fRData.getRowIndex(state); - - if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) { - // Match found, common case, could have lookahead so we move - // on to check it - result = fText.getIndex(); + if ((category & 0x4000) != 0) { + fDictionaryCharCount++; + // And off the dictionary flag bit. + category &= ~0x4000; } - - if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) { - if (lookaheadStatus != 0 - && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) { - // Lookahead match is completed. Set the result - // accordingly, but only - // if no other rule has matched further in the mean - // time. - result = lookaheadResult; - lookaheadStatus = 0; - // TODO: make a stand-alone hard break in a rule work. - - if (lookAheadHardBreak) { - break mainLoop; - } - // Look-ahead completed, but other rules may match further. - // Continue on. - // TODO: junk this feature? I don't think that it's used anywhere. - break innerBlock; - } - // Hit a possible look-ahead match. We are at the - // position of the '/'. Remember this position. - lookaheadResult = fText.getIndex(); - lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; - break innerBlock; - } - - // not lookahead... - if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) { - // This is a plain (non-look-ahead) accepting state. - if (!lookAheadHardBreak) { - // Clear out any pending look-ahead matches, - // but only if not doing the lookAheadHardBreak option - // which needs to force a break no matter what is going - // on with the rest of the match, i.e. we can't abandon - // a partially completed look-ahead match because - // some other rule matched further than the '/' position - // in the look-ahead match. - lookaheadStatus = 0; - } + } + + + if (TRACE) { + System.out.print(" " + fText.getIndex() + " "); + if (0x20 <= c && c < 0x7f) { + System.out.print(" " + c + " "); + } else { + System.out.print(" " + Integer.toHexString(c) + " "); + } + System.out.println(" " + state + " " + category + " "); + } + + // State Transition - move machine to its next state + // + state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; + row = fRData.getRowIndex(state); + + if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) { + // Match found, common case, could have lookahead so we move + // on to check it + result = fText.getIndex(); + } + + + int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING]; + if (completedRule > 0) { + // Lookahead match is completed. + int lookaheadResult = fLookAheadMatches.getPosition(completedRule); + if (lookaheadResult >= 0) { + result = lookaheadResult; + break mainLoop; } - - } // end of innerBlock. "break innerBlock" in above code comes out here. - - + } + int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; + if (rule != 0) { + // At the position of a '/' in a look-ahead match. Record it. + int pos = fText.getIndex(); + fLookAheadMatches.setPosition(rule, pos); + } + if (state == STOP_STATE) { // Normal loop exit is here break mainLoop; } - + // then move iterator position backwards one character // if (mode == RBBI_RUN) { @@ -1531,10 +1517,10 @@ public class RuleBasedBreakIterator extends BreakIterator { mode = RBBI_RUN; } } - - + + } // End of the main loop. - + // The state machine is done. Check whether it found a match... // // If the iterator failed to advance in the match engine, force it ahead by one. @@ -1545,12 +1531,12 @@ public class RuleBasedBreakIterator extends BreakIterator { previous32(fText); result = fText.getIndex(); } - + fText.setIndex(result); if (TRACE) { System.out.println("Result = " + result); } - + return result; } } diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar index fe5766bccc7..31108dd39f3 100755 --- a/icu4j/main/shared/data/icudata.jar +++ b/icu4j/main/shared/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:848a445cb828689cd5bca20bfd321db5503ef66c0a94d929fc108a28d0c5595f -size 11754757 +oid sha256:eb9182edec08706f02236909aaefcbf4c98d29d6415d1a8801633233c74f03fb +size 11789631 diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar index 132d2c7fe68..b966701e305 100755 --- a/icu4j/main/shared/data/icutzdata.jar +++ b/icu4j/main/shared/data/icutzdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a75dfbe25f7671a65bb933aed49a71eb9a923767687625982603c54860478ce7 +oid sha256:cefefda6f12f61e7dcd7767a7b07b0fea3ca53c2a9b1524f3627e94cad6f3ee0 size 90259 diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java index 70f7edda17d..d217399fd20 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java @@ -1,6 +1,6 @@ /* ******************************************************************************* - * Copyright (C) 2003-2015 International Business Machines Corporation and + * Copyright (C) 2003-2016 International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ @@ -26,18 +26,18 @@ import com.ibm.icu.text.UnicodeSet; * Monkey tests for RBBI. These tests have independent implementations of * the Unicode TR boundary rules, and compare results between these and ICU's * implementation, using random data. - * + * * Tests cover Grapheme Cluster (char), Word and Line breaks - * + * * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp * */ public class RBBITestMonkey extends TestFmwk { - + public static void main(String[] args) { new RBBITestMonkey().run(args); } - + // // classs RBBIMonkeyKind // @@ -49,7 +49,7 @@ public class RBBITestMonkey extends TestFmwk { // testing, but works purely in terms of the interface defined here. // abstract static class RBBIMonkeyKind { - + // Return a List of UnicodeSets, representing the character classes used // for this type of iterator. abstract List charClasses(); @@ -60,14 +60,14 @@ public class RBBITestMonkey extends TestFmwk { // Find the next break position, starting from the specified position. // Return -1 after reaching end of string. abstract int next(int i); - + // A Character Property, one of the constants defined in class UProperty. // The value of this property will be displayed for the characters - // near any test failure. + // near any test failure. int fCharProperty; } - + /** * Monkey test subclass for testing Character (Grapheme Cluster) boundaries. * Note: As of Unicode 6.1, fPrependSet is empty, so don't add it to fSets @@ -88,6 +88,11 @@ public class RBBITestMonkey extends TestFmwk { UnicodeSet fLVTSet; UnicodeSet fHangulSet; UnicodeSet fAnySet; + UnicodeSet fEmojiModifierSet; + UnicodeSet fEmojiBaseSet; + UnicodeSet fZWJSet; + UnicodeSet fGAZSet; + StringBuffer fText; @@ -96,8 +101,8 @@ public class RBBITestMonkey extends TestFmwk { fText = null; fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK; fCRLFSet = new UnicodeSet("[\\r\\n]"); - fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]"); - fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]"); + fControlSet = new UnicodeSet("[[\\p{Grapheme_Cluster_Break = Control}-[:Block=Tags:]]]"); + fExtendSet = new UnicodeSet("[[\\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]]"); fRegionalIndicatorSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"); fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]"); fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]"); @@ -115,6 +120,17 @@ public class RBBITestMonkey extends TestFmwk { fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]"); + fEmojiBaseSet = new UnicodeSet( + "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443" + + "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483" + + "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647" + + "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"); + + fEmojiModifierSet = new UnicodeSet(0x0001F3FB, 0x0001F3FF); + fZWJSet = new UnicodeSet(0x200D, 0x200D); + fGAZSet = new UnicodeSet("[\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8\\u2764]"); + + fSets = new ArrayList(); fSets.add(fCRLFSet); fSets.add(fControlSet); @@ -126,44 +142,49 @@ public class RBBITestMonkey extends TestFmwk { fSets.add(fSpacingSet); fSets.add(fHangulSet); fSets.add(fAnySet); + fSets.add(fEmojiBaseSet); + fSets.add(fEmojiModifierSet); + fSets.add(fZWJSet); + fSets.add(fGAZSet); } void setText(StringBuffer s) { fText = s; } - + List charClasses() { return fSets; } - + int next(int prevPos) { - int p1, p2, p3; // Indices of the significant code points around the - // break position being tested. The candidate break - // location is before p2. - + int p0, p1, p2, p3; // Indices of the significant code points around the + // break position being tested. The candidate break + // location is before p2. + int breakPos = -1; - - int c1, c2, c3; // The code points at p0, p1, p2 & p3. - + + int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. + // Previous break at end of string. return DONE. if (prevPos >= fText.length()) { return -1; } - p1 = p2 = p3 = prevPos; + p0 = p1 = p2 = p3 = prevPos; c3 = UTF16.charAt(fText, prevPos); - c1 = c2 = 0; - + c0 = c1 = c2 = 0; + // Loop runs once per "significant" character position in the input text. for (;;) { // Move all of the positions forward in the input string. + p0 = p1; c0 = c1; p1 = p2; c1 = c2; p2 = p3; c2 = c3; - + // Advance p3 by one codepoint p3 = moveIndex32(fText, p3, 1); c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3); - + if (p1 == p2) { // Still warming up the loop. (won't work with zero length strings, but we don't care) continue; @@ -172,7 +193,7 @@ public class RBBITestMonkey extends TestFmwk { // Reached end of string. Always a break position. break; } - + // Rule GB3 CR x LF // No Extend or Format characters may appear between the CR and LF, // which requires the additional check for p2 immediately following p1. @@ -180,14 +201,14 @@ public class RBBITestMonkey extends TestFmwk { if (c1==0x0D && c2==0x0A && p1==(p2-1)) { continue; } - + // Rule (GB4). ( Control | CR | LF ) if (fControlSet.contains(c1) || c1 == 0x0D || c1 == 0x0A) { break; } - + // Rule (GB5) ( Control | CR | LF ) // if (fControlSet.contains(c2) || @@ -195,8 +216,8 @@ public class RBBITestMonkey extends TestFmwk { c2 == 0x0A) { break; } - - + + // Rule (GB6) L x ( L | V | LV | LVT ) if (fLSet.contains(c1) && (fLSet.contains(c2) || @@ -205,43 +226,61 @@ public class RBBITestMonkey extends TestFmwk { fLVTSet.contains(c2))) { continue; } - + // Rule (GB7) ( LV | V ) x ( V | T ) if ((fLVSet.contains(c1) || fVSet.contains(c1)) && (fVSet.contains(c2) || fTSet.contains(c2))) { continue; } - + // Rule (GB8) ( LVT | T) x T if ((fLVTSet.contains(c1) || fTSet.contains(c1)) && fTSet.contains(c2)) { continue; } - + // Rule (GB8a) Regional_Indicator x Regional_Indicator + // Note: The first if condition is a little tricky. We only need to force + // a break if there are three or more contiguous RIs. If there are + // only two, a break following will occur via other rules, and will include + // any trailing extend characters, which is needed behavior. + if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1) + && fRegionalIndicatorSet.contains(c2)) { + break; + } + if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) { continue; } - - // Rule (GB9) Numeric x ALetter - if (fExtendSet.contains(c2)) { + + // Rule (GB9) x Extend + if (fExtendSet.contains(c2) || fZWJSet.contains(c2)) { continue; } - + // Rule (GB9a) x SpacingMark if (fSpacingSet.contains(c2)) { continue; } - + // Rule (GB9b) Prepend x if (fPrependSet.contains(c1)) { continue; } - + // Rule (GB9c) Emoji_Base x Emoji_Modifier + if ((fEmojiBaseSet.contains(c1) || fGAZSet.contains(c1)) && fEmojiModifierSet.contains(c2)) { + continue; + } + + // Rule (GB9d) ZWJ x Glue_After_Zwj + if (fZWJSet.contains(c1) && fGAZSet.contains(c2)) { + continue; + } + // Rule (GB10) Any Any break; } - + breakPos = p2; return breakPos; } @@ -249,11 +288,11 @@ public class RBBITestMonkey extends TestFmwk { /** - * + * * Word Monkey Test Class * - * - * + * + * */ static class RBBIWordMonkey extends RBBIMonkeyKind { List fSets; @@ -275,10 +314,14 @@ public class RBBITestMonkey extends TestFmwk { UnicodeSet fFormatSet; UnicodeSet fExtendSet; UnicodeSet fExtendNumLetSet; - UnicodeSet fOtherSet; + UnicodeSet fOtherSet; UnicodeSet fDictionaryCjkSet; + UnicodeSet fEBaseSet; + UnicodeSet fEModifierSet; + UnicodeSet fZWSSet; + UnicodeSet fGAZSet; + - RBBIWordMonkey() { fCharProperty = UProperty.WORD_BREAK; @@ -286,13 +329,13 @@ public class RBBITestMonkey extends TestFmwk { fCRSet = new UnicodeSet("[\\p{Word_Break = CR}]"); fLFSet = new UnicodeSet("[\\p{Word_Break = LF}]"); fNewlineSet = new UnicodeSet("[\\p{Word_Break = Newline}]"); - fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]"); + fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]"); fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]"); - fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]"); + fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]"); fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]"); fALetterSet.removeAll(fDictionaryCjkSet); fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]"); - fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]"); + fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]"); fMidNumLetSet = new UnicodeSet("[\\p{Word_Break = MidNumLet}]"); fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]"); fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]"); @@ -300,6 +343,16 @@ public class RBBITestMonkey extends TestFmwk { fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]"); fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]"); fExtendSet = new UnicodeSet("[\\p{Word_Break = Extend}]"); + fEBaseSet = new UnicodeSet( + "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443" + + "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483" + + "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647" + + "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"); + + fEModifierSet = new UnicodeSet("[\\U0001F3FB-\\U0001F3FF]"); + fZWSSet = new UnicodeSet(0x200D, 0x200D); + fGAZSet = new UnicodeSet("[\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8\\u2764]"); + fExtendSet.removeAll(fZWSSet); fOtherSet = new UnicodeSet(); fOtherSet.complement(); @@ -318,6 +371,11 @@ public class RBBITestMonkey extends TestFmwk { fOtherSet.removeAll(fExtendSet); fOtherSet.removeAll(fExtendNumLetSet); fOtherSet.removeAll(fRegionalIndicatorSet); + fOtherSet.removeAll(fEBaseSet); + fOtherSet.removeAll(fEModifierSet); + fOtherSet.removeAll(fZWSSet); + fOtherSet.removeAll(fGAZSet); + // Inhibit dictionary characters from being tested at all. // remove surrogates so as to not generate higher CJK characters fOtherSet.removeAll(new UnicodeSet("[[\\p{LineBreak = Complex_Context}][:Line_Break=Surrogate:]]")); @@ -342,24 +400,24 @@ public class RBBITestMonkey extends TestFmwk { fSets.add(fExtendNumLetSet); fSets.add(fOtherSet); } - - + + List charClasses() { - return fSets; + return fSets; + } + + void setText(StringBuffer s) { + fText = s; } - - void setText(StringBuffer s) { - fText = s; - } - int next(int prevPos) { - int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the + int next(int prevPos) { + int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the // break position being tested. The candidate break // location is before p2. int breakPos = -1; - + int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. - + // Previous break at end of string. return DONE. if (prevPos >= fText.length()) { return -1; @@ -367,8 +425,8 @@ public class RBBITestMonkey extends TestFmwk { /*p0 =*/ p1 = p2 = p3 = prevPos; c3 = UTF16.charAt(fText, prevPos); c0 = c1 = c2 = 0; - - + + // Loop runs once per "significant" character position in the input text. for (;;) { @@ -376,7 +434,7 @@ public class RBBITestMonkey extends TestFmwk { /*p0 = p1;*/ c0 = c1; p1 = p2; c1 = c2; p2 = p3; c2 = c3; - + // Advance p3 by X(Extend | Format)* Rule 4 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change) do { @@ -390,7 +448,7 @@ public class RBBITestMonkey extends TestFmwk { break; } } - while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3)); + while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3) || setContains(fZWSSet, c3)); if (p1 == p2) { // Still warming up the loop. (won't work with zero length strings, but we don't care) @@ -408,7 +466,7 @@ public class RBBITestMonkey extends TestFmwk { if (c1==0x0D && c2==0x0A) { continue; } - + // Rule (3a) Break before and after newlines (including CR and LF) // if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) { @@ -418,12 +476,19 @@ public class RBBITestMonkey extends TestFmwk { break; } + // Rule (3c) ZWJ x GAZ (Glue after ZWJ). + // Not ignoring extend chars, so peek into input text to + // get the potential ZWJ, the character immediately preceding c2. + if (fZWSSet.contains(fText.codePointBefore(p2)) && fGAZSet.contains(c2)) { + continue; + } + // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter) if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) && (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) { continue; } - + // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter) // if ( (fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) && @@ -453,13 +518,13 @@ public class RBBITestMonkey extends TestFmwk { if (fHebrew_LetterSet.contains(c0) && fDouble_QuoteSet.contains(c1) && fHebrew_LetterSet.contains(c2)) { continue; } - + // Rule (8) Numeric x Numeric if (fNumericSet.contains(c1) && fNumericSet.contains(c2)) { continue; } - + // Rule (9) (ALetter | Hebrew_Letter) x Numeric if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) && fNumericSet.contains(c2)) { @@ -478,14 +543,14 @@ public class RBBITestMonkey extends TestFmwk { fNumericSet.contains(c2)) { continue; } - + // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric if (fNumericSet.contains(c1) && (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) && setContains(fNumericSet, c3)) { continue; } - + // Rule (13) Katakana x Katakana if (fKatakanaSet.contains(c1) && fKatakanaSet.contains(c2)) { @@ -498,7 +563,7 @@ public class RBBITestMonkey extends TestFmwk { fExtendNumLetSet.contains(c2)) { continue; } - + // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana) if (fExtendNumLetSet.contains(c1) && (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2) || @@ -506,32 +571,40 @@ public class RBBITestMonkey extends TestFmwk { continue; } - - // Rule 13c Do not break between Regional Indicators. + + // Rule 13c Do not break between Regional Indicators. // Regional_Indicator × Regional_Indicator + if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)) { + break; + } if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) { continue; } - + + // Rule 13d + if ((fEBaseSet.contains(c1) || fGAZSet.contains(c1)) && fEModifierSet.contains(c2)) { + continue; + } + // Rule 14. Break found here. break; } - + breakPos = p2; return breakPos; } - + } - + static class RBBILineMonkey extends RBBIMonkeyKind { - + List fSets; - + // UnicodeSets for each of the Line Breaking character classes. // Order matches that of Unicode UAX 14, Table 1, which makes it a little easier // to verify that they are all accounted for. - + UnicodeSet fBK; UnicodeSet fCR; UnicodeSet fLF; @@ -570,19 +643,21 @@ public class RBBITestMonkey extends TestFmwk { UnicodeSet fJV; UnicodeSet fJT; UnicodeSet fRI; - UnicodeSet fSA; UnicodeSet fXX; - + UnicodeSet fEB; + UnicodeSet fEM; + UnicodeSet fZJ; + StringBuffer fText; int fOrigPositions; - - - + + + RBBILineMonkey() { fCharProperty = UProperty.LINE_BREAK; fSets = new ArrayList(); - + fBK = new UnicodeSet("[\\p{Line_Break=BK}]"); fCR = new UnicodeSet("[\\p{Line_break=CR}]"); fLF = new UnicodeSet("[\\p{Line_break=LF}]"); @@ -621,23 +696,33 @@ public class RBBITestMonkey extends TestFmwk { fJV = new UnicodeSet("[\\p{Line_break=JV}]"); fJT = new UnicodeSet("[\\p{Line_break=JT}]"); fRI = new UnicodeSet("[\\p{Line_break=RI}]"); - fSA = new UnicodeSet("[\\p{Line_break=SA}]"); fXX = new UnicodeSet("[\\p{Line_break=XX}]"); + fEB = new UnicodeSet( + "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443" + + "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483" + + "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647" + + "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"); + fEM = new UnicodeSet("[\\U0001F3FB-\\U0001F3FF]"); + fZJ = new UnicodeSet(0x200D, 0x200D); // Remove dictionary characters. // The monkey test reference implementation of line break does not replicate the dictionary behavior, // so dictionary characters are omitted from the monkey test data. UnicodeSet dictionarySet = new UnicodeSet( "[[:LineBreak = Complex_Context:] & [[:Script = Thai:][:Script = Lao:][:Script = Khmer:] [:script = Myanmar:]]]"); - fSA.removeAll(dictionarySet); fAL.addAll(fXX); // Default behavior for XX is identical to AL fAL.addAll(fAI); // Default behavior for AI is identical to AL - fAL.addAll(fSA); // Default behavior for SA is XX, which defaults to AL fAL.addAll(fSG); // Default behavior for SG (unpaired surrogates) is AL - + fNS.addAll(fCJ); // Default behavior for CJ is identical to NS. - + + fID.addAll(fEB); // Emoji Base and Emoji Modifier behave as ID. + fID.addAll(fEM); + fAL.removeAll(fEM); + fAL.remove(0x2764); // Emoji Proposal: move u2764 from AL to ID + fID.add(0x2764); + fSets.add(fBK); fSets.add(fCR); fSets.add(fLF); @@ -674,47 +759,50 @@ public class RBBITestMonkey extends TestFmwk { fSets.add(fHL); fSets.add(fID); fSets.add(fWJ); - fSets.add(fSA); - fSets.add(fSG); fSets.add(fRI); + fSets.add(fSG); + fSets.add(fEB); + fSets.add(fEM); + fSets.add(fZJ); + } - + void setText(StringBuffer s) { fText = s; } - - - + + + int next(int startPos) { int pos; // Index of the char following a potential break position int thisChar; // Character at above position "pos" - + int prevPos; // Index of the char preceding a potential break position int prevChar; // Character at above position. Note that prevChar // and thisChar may not be adjacent because combining // characters between them will be ignored. int prevCharX2; // Character before prevChar, more contex for LB 21a - + int nextPos; // Index of the next character following pos. // Usually skips over combining marks. int tPos; // temp value. int matchVals[] = null; // Number Expression Match Results - - + + if (startPos >= fText.length()) { return -1; } - - + + // Initial values for loop. Loop will run the first time without finding breaks, // while the invalid values shift out and the "this" and // "prev" positions are filled in with good values. pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration. thisChar = prevChar = prevCharX2 = 0; nextPos = startPos; - - + + // Loop runs once per position in the test text, until a break position // is found. In each iteration, we are testing for a possible break // just preceding the character at index "pos". The character preceding @@ -727,28 +815,28 @@ public class RBBITestMonkey extends TestFmwk { prevChar = thisChar; pos = nextPos; nextPos = moveIndex32(fText, pos, 1); - + // Rule LB2 - Break at end of text. if (pos >= fText.length()) { break; } - + // Rule LB 9 - adjust for combining sequences. // We do this rule out-of-order because the adjustment does // not effect the way that rules LB 3 through LB 6 match, // and doing it here rather than after LB 6 is substantially // simpler when combining sequences do occur. - - + + // LB 9 Keep combining sequences together. - // advance over any CM class chars at "pos", + // advance over any CM class chars at "pos", // result is "nextPos" for the following loop iteration. thisChar = UTF16.charAt(fText, pos); if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d || thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) { for (;;) { if (nextPos == fText.length()) { - break; + break; } int nextChar = UTF16.charAt(fText, nextPos); if (!fCM.contains(nextChar)) { @@ -757,28 +845,28 @@ public class RBBITestMonkey extends TestFmwk { nextPos = moveIndex32(fText, nextPos, 1); } } - + // LB 9 Treat X CM* as if it were X // No explicit action required. - + // LB 10 Treat any remaining combining mark as AL if (fCM.contains(thisChar)) { - thisChar = 'A'; + thisChar = 'A'; } - + // If the loop is still warming up - if we haven't shifted the initial // -1 positions out of prevPos yet - loop back to advance the // position in the input without any further looking for breaks. if (prevPos == -1) { continue; } - + // LB 4 Always break after hard line breaks, if (fBK.contains(prevChar)) { break; } - + // LB 5 Break after CR, LF, NL, but not inside CR LF if (fCR.contains(prevChar) && fLF.contains(thisChar)) { continue; @@ -788,46 +876,57 @@ public class RBBITestMonkey extends TestFmwk { fNL.contains(prevChar)) { break; } - + // LB 6 Don't break before hard line breaks if (fBK.contains(thisChar) || fCR.contains(thisChar) || fLF.contains(thisChar) || fNL.contains(thisChar) ) { continue; } - - + + // LB 7 Don't break before spaces or zero-width space. if (fSP.contains(thisChar)) { continue; } - + if (fZW.contains(thisChar)) { continue; } - + // LB 8 Break after zero width space if (fZW.contains(prevChar)) { break; } - + + // LB 8a ZJ x ID + // The monkey test's way of ignoring combining characters doesn't work + // for this rule. ZJ is also a CM. Need to get the actual character + // preceding "thisChar", not ignoring combining marks, possibly ZJ. + { + int prevC = fText.codePointBefore(pos); + if (fZJ.contains(prevC) && fID.contains(thisChar)) { + continue; + } + } + // LB 9, 10 Already done, at top of loop. // - - + + // LB 11 // x WJ // WJ x if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) { continue; } - - + + // LB 12 // GL x if (fGL.contains(prevChar)) { continue; } - + // LB 12a // [^SP BA HY] x GL if (!(fSP.contains(prevChar) || @@ -836,8 +935,8 @@ public class RBBITestMonkey extends TestFmwk { continue; } - - + + // LB 13 Don't break before closings. // NU x CL, NU x CP and NU x IS are not matched here so that they will // fall into LB 17 and the more general number regular expression. @@ -849,7 +948,7 @@ public class RBBITestMonkey extends TestFmwk { !fNU.contains(prevChar) && fSY.contains(thisChar)) { continue; } - + // LB 14 Don't break after OP SP* // Scan backwards, checking for this sequence. // The OP char could include combining marks, so we actually check for @@ -866,8 +965,8 @@ public class RBBITestMonkey extends TestFmwk { if (fOP.contains(UTF16.charAt(fText, tPos))) { continue; } - - // LB 15 Do not break within "[ + + // LB 15 Do not break within "[ // QU CM* SP* x OP if (fOP.contains(thisChar)) { // Scan backwards from prevChar to see if it is preceded by QU CM* SP* @@ -881,8 +980,8 @@ public class RBBITestMonkey extends TestFmwk { if (fQU.contains(UTF16.charAt(fText, tPos))) { continue; } - } - + } + // LB 16 (CL | CP) SP* x NS if (fNS.contains(thisChar)) { tPos = prevPos; @@ -895,9 +994,9 @@ public class RBBITestMonkey extends TestFmwk { if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) { continue; } - } - - + } + + // LB 17 B2 SP* x B2 if (fB2.contains(thisChar)) { tPos = prevPos; @@ -910,25 +1009,25 @@ public class RBBITestMonkey extends TestFmwk { if (fB2.contains(UTF16.charAt(fText, tPos))) { continue; } - } - + } + // LB 18 break after space if (fSP.contains(prevChar)) { break; } - + // LB 19 // x QU // QU x if (fQU.contains(thisChar) || fQU.contains(prevChar)) { continue; } - + // LB 20 Break around a CB if (fCB.contains(thisChar) || fCB.contains(prevChar)) { break; } - + // LB 21 if (fBA.contains(thisChar) || fHY.contains(thisChar) || @@ -936,7 +1035,7 @@ public class RBBITestMonkey extends TestFmwk { fBB.contains(prevChar) ) { continue; } - + // LB 21a, HL (HY | BA) x if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) { continue; @@ -946,7 +1045,7 @@ public class RBBITestMonkey extends TestFmwk { if (fSY.contains(prevChar) && fHL.contains(thisChar)) { continue; } - + // LB 22 if (fAL.contains(prevChar) && fIN.contains(thisChar) || fEX.contains(prevChar) && fIN.contains(thisChar) || @@ -956,8 +1055,8 @@ public class RBBITestMonkey extends TestFmwk { fNU.contains(prevChar) && fIN.contains(thisChar) ) { continue; } - - + + // LB 23 ID x PO (Note: Leading CM behaves like ID) // AL x NU // NU x AL @@ -968,7 +1067,7 @@ public class RBBITestMonkey extends TestFmwk { fNU.contains(prevChar) && fHL.contains(thisChar) ) { continue; } - + // LB 24 Do not break between prefix and letters or ideographs. // PR x ID // PR x AL @@ -978,8 +1077,8 @@ public class RBBITestMonkey extends TestFmwk { fPO.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { continue; } - - + + // LB 25 Numbers matchVals = LBNumberCheck(fText, prevPos, matchVals); if (matchVals[0] != -1) { @@ -995,7 +1094,7 @@ public class RBBITestMonkey extends TestFmwk { nextPos = numEndIdx; pos = numEndIdx; do { - pos = moveIndex32(fText, pos, -1); + pos = moveIndex32(fText, pos, -1); thisChar = UTF16.charAt(fText, pos); } while (fCM.contains(thisChar)); @@ -1003,8 +1102,8 @@ public class RBBITestMonkey extends TestFmwk { continue; } } - - + + // LB 26 Do not break Korean Syllables if (fJL.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) || @@ -1039,18 +1138,18 @@ public class RBBITestMonkey extends TestFmwk { continue; } - - + + // LB 28 Do not break between alphabetics if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { continue; } - + // LB 29 Do not break between numeric punctuation and alphabetics if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { continue; } - + // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation. // (AL | NU) x OP // CP x (AL | NU) @@ -1061,20 +1160,29 @@ public class RBBITestMonkey extends TestFmwk { continue; } - // LB 30a Do not break between regional indicators. RI × RI + // LB 30a Break between pairs of Regional Indicators. + // RI RI RI + // RI x RI + if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) { + break; + } if (fRI.contains(prevChar) && fRI.contains(thisChar)) { continue; } - + + // LB30b Emoji Base x Emoji Modifier + if (fEB.contains(prevChar) && fEM.contains(thisChar)) { + continue; + } // LB 31 Break everywhere else - break; + break; } - + return pos; } - - - + + + // Match the following regular expression in the input text. // ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)? (PR | PO) CM*)? // 0 0 1 3 3 4 7 7 7 7 9 9 9 11 11 (match states) @@ -1090,15 +1198,15 @@ public class RBBITestMonkey extends TestFmwk { retVals[0] = -1; // Indicates no match. int matchState = 0; int idx = startIdx; - + matchLoop: for (idx = startIdx; idx 4) { - retVals[0] = startIdx; - retVals[1] = idx; + retVals[0] = startIdx; + retVals[1] = idx; } return retVals; } - - + + List charClasses() { return fSets; } - - - + + + } - + /** - * + * * Sentence Monkey Test Class * - * - * + * + * */ static class RBBISentenceMonkey extends RBBIMonkeyKind { List fSets; @@ -1247,8 +1355,8 @@ public class RBBITestMonkey extends TestFmwk { UnicodeSet fOtherSet; UnicodeSet fExtendSet; - - + + RBBISentenceMonkey() { fCharProperty = UProperty.SENTENCE_BREAK; @@ -1301,26 +1409,26 @@ public class RBBITestMonkey extends TestFmwk { fSets.add(fOtherSet); fSets.add(fExtendSet); } - - + + List charClasses() { - return fSets; + return fSets; + } + + void setText(StringBuffer s) { + fText = s; } - - void setText(StringBuffer s) { - fText = s; - } - + // moveBack() Find the "significant" code point preceding the index i. // Skips over ($Extend | $Format)* - // + // private int moveBack(int i) { - + if (i <= 0) { return -1; } - + int c; int j = i; do { @@ -1330,8 +1438,8 @@ public class RBBITestMonkey extends TestFmwk { while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c))); return j; } - - + + int moveForward(int i) { if (i>=fText.length()) { return fText.length(); @@ -1344,9 +1452,9 @@ public class RBBITestMonkey extends TestFmwk { } while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c))); return j; - + } - + int cAt(int pos) { if (pos<0 || pos>=fText.length()) { return -1; @@ -1354,15 +1462,15 @@ public class RBBITestMonkey extends TestFmwk { return UTF16.charAt(fText, pos); } - int next(int prevPos) { - int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the + int next(int prevPos) { + int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the // break position being tested. The candidate break // location is before p2. int breakPos = -1; - + int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. int c; - + // Prev break at end of string. return DONE. if (prevPos >= fText.length()) { return -1; @@ -1370,28 +1478,28 @@ public class RBBITestMonkey extends TestFmwk { /*p0 =*/ p1 = p2 = p3 = prevPos; c3 = UTF16.charAt(fText, prevPos); c0 = c1 = c2 = 0; - + // Loop runs once per "significant" character position in the input text. for (;;) { // Move all of the positions forward in the input string. /*p0 = p1;*/ c0 = c1; p1 = p2; c1 = c2; p2 = p3; c2 = c3; - + // Advancd p3 by X(Extend | Format)* Rule 4 p3 = moveForward(p3); c3 = cAt(p3); - + // Rule (3) CR x LF if (c1==0x0d && c2==0x0a && p2==(p1+1)) { continue; } - + // Rule (4) Sep if (fSepSet.contains(c1)) { p2 = p1+1; // Separators don't combine with Extend or Format break; - } + } if (p2 >= fText.length()) { // Reached end of string. Always a break position. @@ -1415,7 +1523,7 @@ public class RBBITestMonkey extends TestFmwk { } // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep))* Lower - // Note: Sterm | ATerm are added to the negated part of the expression by a + // Note: Sterm | ATerm are added to the negated part of the expression by a // note to the Unicode 5.0 documents. int p8 = p1; while (p8>0 && fSpSet.contains(cAt(p8))) { @@ -1430,7 +1538,7 @@ public class RBBITestMonkey extends TestFmwk { c = cAt(p8); if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) || fLowerSet.contains(c) || fSepSet.contains(c) || - fATermSet.contains(c) || fSTermSet.contains(c)) + fATermSet.contains(c) || fSTermSet.contains(c)) { break; } @@ -1440,7 +1548,7 @@ public class RBBITestMonkey extends TestFmwk { continue; } } - + // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | Sterm | ATerm) if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) { p8 = p1; @@ -1504,12 +1612,12 @@ public class RBBITestMonkey extends TestFmwk { breakPos = p2; return breakPos; } - - + + } - + /** * Move an index into a string by n code points. * Similar to UTF16.moveCodePointOffset, but without the exceptions, which were @@ -1526,35 +1634,35 @@ public class RBBITestMonkey extends TestFmwk { if (amt>0) { for (i=0; i= s.length()) { - return s.length(); + return s.length(); } c = s.charAt(pos); pos++; if (UTF16.isLeadSurrogate(c) && pos < s.length()) { c = s.charAt(pos); if (UTF16.isTrailSurrogate(c)) { - pos++; + pos++; } } } } else { for (i=0; i>amt; i--) { if (pos <= 0) { - return 0; + return 0; } pos--; c = s.charAt(pos); if (UTF16.isTrailSurrogate(c) && pos >= 0) { c = s.charAt(pos); if (UTF16.isLeadSurrogate(c)) { - pos--; + pos--; } } } } return pos; } - + /** * No-exceptions form of UnicodeSet.contains(c). * Simplifies loops that terminate with an end-of-input character value. @@ -1568,8 +1676,8 @@ public class RBBITestMonkey extends TestFmwk { } return s.contains(c); } - - + + /** * return the index of the next code point in the input text. * @param i the preceding index @@ -1589,8 +1697,8 @@ public class RBBITestMonkey extends TestFmwk { } return retVal; } - - + + /** * random number generator. Not using Java's built-in Randoms for two reasons: * 1. Using this code allows obtaining the same sequences as those from the ICU4C monkey test. @@ -1641,7 +1749,7 @@ public class RBBITestMonkey extends TestFmwk { } } - + /** * Run a RBBI monkey test. Common routine, for all break iterator types. * Parameters: @@ -1688,20 +1796,20 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int // Debugging settings. Comment out everything in the following block for normal operation // //-------------------------------------------------------------------------------------------- - // numIterations = -1; + // numIterations = -1; // RuleBasedBreakIterator_New.fTrace = true; // m_seed = 859056465; // TESTSTRINGLEN = 50; // printTestData = true; // printBreaksFromBI = true; // ((RuleBasedBreakIterator_New)bi).dump(); - + //-------------------------------------------------------------------------------------------- // - // End of Debugging settings. + // End of Debugging settings. // //-------------------------------------------------------------------------------------------- - + int dotsOnLine = 0; while (loopCount < numIterations || numIterations == -1) { if (numIterations == -1 && loopCount % 10 == 0) { @@ -1720,7 +1828,7 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int testText.setLength(0); // Populate a test string with data. if (printTestData) { - System.out.println("Test Data string ..."); + System.out.println("Test Data string ..."); } for (i=0; i= testText.length()) {break;} - if (expectedBreaks[endContext-1]) { + if (expectedBreaks[endContext-1]) { if (count == 0) break; count --; } @@ -1910,7 +2018,7 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int String gc = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(c), UProperty.NameChoice.SHORT); appendToBuf(errorText, gc, 8); int extraProp = UCharacter.getIntPropertyValue(c, mk.fCharProperty); - String extraPropValue = + String extraPropValue = UCharacter.getPropertyValueName(mk.fCharProperty, extraProp, UProperty.NameChoice.LONG); appendToBuf(errorText, extraPropValue, 20); @@ -1925,7 +2033,7 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int errorText.append("\n"); // Output the error - errln(name + " break monkey test error. " + + errln(name + " break monkey test error. " + (expectedBreaks[i]? "Break expected but not found." : "Break found but not expected.") + "\nOperation = " + errorType + "; random seed = " + seed + "; buf Idx = " + i + "\n" + errorText); @@ -1938,28 +2046,28 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int } public void TestCharMonkey() { - + int loopCount = 500; int seed = 1; - + if (params.inclusion >= 9) { loopCount = 10000; } - + RBBICharMonkey m = new RBBICharMonkey(); BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US); RunMonkey(bi, m, "char", seed, loopCount); } public void TestWordMonkey() { - + int loopCount = 500; int seed = 1; - + if (params.inclusion >= 9) { loopCount = 10000; } - + logln("Word Break Monkey Test"); RBBIWordMonkey m = new RBBIWordMonkey(); BreakIterator bi = BreakIterator.getWordInstance(Locale.US); @@ -1969,11 +2077,11 @@ public void TestWordMonkey() { public void TestLineMonkey() { int loopCount = 500; int seed = 1; - + if (params.inclusion >= 9) { loopCount = 10000; } - + logln("Line Break Monkey Test"); RBBILineMonkey m = new RBBILineMonkey(); BreakIterator bi = BreakIterator.getLineInstance(Locale.US); @@ -1984,14 +2092,14 @@ public void TestLineMonkey() { } public void TestSentMonkey() { - + int loopCount = 500; int seed = 1; - + if (params.inclusion >= 9) { loopCount = 3000; } - + logln("Sentence Break Monkey Test"); RBBISentenceMonkey m = new RBBISentenceMonkey(); BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US); @@ -2011,14 +2119,14 @@ public void TestSentMonkey() { // rebuild break iterators from the original source rules. // public void TestRTCharMonkey() { - + int loopCount = 200; int seed = 1; - + if (params.inclusion >= 9) { loopCount = 2000; } - + RBBICharMonkey m = new RBBICharMonkey(); BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US); String rules = bi.toString(); @@ -2027,10 +2135,10 @@ public void TestRTCharMonkey() { } public void TestRTWordMonkey() { - + int loopCount = 200; int seed = 1; - + if (params.inclusion >= 9) { loopCount = 2000; } @@ -2045,11 +2153,11 @@ public void TestRTWordMonkey() { public void TestRTLineMonkey() { int loopCount = 200; int seed = 1; - + if (params.inclusion >= 9) { loopCount = 2000; } - + logln("Line Break Monkey Test"); RBBILineMonkey m = new RBBILineMonkey(); BreakIterator bi = BreakIterator.getLineInstance(Locale.US); @@ -2062,14 +2170,14 @@ public void TestRTLineMonkey() { } public void TestRTSentMonkey() { - + int loopCount = 200; int seed = 1; - + if (params.inclusion >= 9) { loopCount = 1000; } - + logln("Sentence Break Monkey Test"); RBBISentenceMonkey m = new RBBISentenceMonkey(); BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US); -- 2.40.0