ICU-13196 RBBI Monkey Test, port to Java. Sync a few changes back to ICU4C.

author Andy Heninger <andy.heninger@gmail.com>

Tue, 1 Aug 2017 01:03:09 +0000 (01:03 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Tue, 1 Aug 2017 01:03:09 +0000 (01:03 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Tue, 1 Aug 2017 01:03:09 +0000 (01:03 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Tue, 1 Aug 2017 01:03:09 +0000 (01:03 +0000)
diff --git a/icu4c/source/test/intltest/rbbimonkeytest.cpp b/icu4c/source/test/intltest/rbbimonkeytest.cpp

index 81335ccfc978e46b7a0357d74dc2111821a5a5da..e5eeeac2eb99721a5062d3ff21348e2647b1d633 100644 (file)
--- a/icu4c/source/test/intltest/rbbimonkeytest.cpp
+++ b/icu4c/source/test/intltest/rbbimonkeytest.cpp
@@ -73,7 +73,7 @@ BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status)  :
      fCharClassList.adoptInstead(new UVector(status));
  
      fSetRefsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
-             "(?!(?:\\{|=|\\[:)[ \\t]{0,4})"              // Negative lookbehind for '{' or '=' or '[:'
+             "(?!(?:\\{|=|\\[:)[ \\t]{0,4})"              // Negative look behind for '{' or '=' or '[:'
                                                            //   (the identifier is a unicode property name or value)
               "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"),     // The char class name
          0, status));
@@ -86,7 +86,7 @@ BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status)  :
                  "\\R$"                          //   new-line at end of line.
              ), 0, status));
  
-    // Match (initial parse) of a character class defintion line.
+    // Match (initial parse) of a character class definition line.
      fClassDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
                  "[ \\t]*"                                // leading white space
                  "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"   // The char class name
@@ -129,7 +129,7 @@ CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeStri
      }
      fSetRefsMatcher->appendTail(expandedDef);
  
-    // Verify that the expanded set defintion is valid.
+    // Verify that the expanded set definition is valid.
  
      if (fMonkeyImpl->fDumpExpansions) {
          printf("epandedDef: %s\n", CStr(expandedDef)());
@@ -149,7 +149,7 @@ CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeStri
  
      if (previousClass != NULL) {
          // Duplicate class def.
-        // These are legitimate, they are adustments of an existing class.
+        // These are legitimate, they are adjustments of an existing class.
          // TODO: will need to keep the old around when we handle tailorings.
          IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(cclass->fName)());
          delete previousClass;
diff --git a/icu4c/source/test/testdata/break_rules/line.txt b/icu4c/source/test/testdata/break_rules/line.txt

index 268da1c99d14d4a0a2130cb00365dc793892f1be..5059d2d6e9bd5b50895a17355388464448341595 100644 (file)
--- a/icu4c/source/test/testdata/break_rules/line.txt
+++ b/icu4c/source/test/testdata/break_rules/line.txt
@@ -116,7 +116,7 @@ LB12:        GL CM* [^CM];
  
  LB12a:       [^SP BA HY] CM* GL;
  
-# LB 13 ICU Tailoring, matches tailoring exmaple 8 from UAX 14.
+# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
  #
  #   LB13.1   [^SP] CM* [CL CP EX IS SY]    # original UAX 14 rule.
  #   LB13.2   SP    CM* [CL CP EX IS SY]
diff --git a/icu4c/source/test/testdata/break_rules/readme.txt b/icu4c/source/test/testdata/break_rules/readme.txt

index c1ed99cde656e3cbf4eadfc14a408fdb48308162..52d54a0f3d608a93c4a213c7623654d5de07fe87 100644 (file)
--- a/icu4c/source/test/testdata/break_rules/readme.txt
+++ b/icu4c/source/test/testdata/break_rules/readme.txt
@@ -5,23 +5,27 @@ License & terms of use: http://www.unicode.org/copyright.html#License
  Copyright (c) 2015-2016, International Business Machines Corporation and others. All Rights Reserved.
  
  This directory contains the break iterator reference rule files used by intltest rbbi/RBBIMonkeyTest/testMonkey.
-The rules in this directory track the boundary rules from Unicode UAX 14 and 29. They are interpretted
+The rules in this directory track the boundary rules from Unicode UAX 14 and 29. They are interpreted
  to provide an expected set of boundary positions to compare with the results from ICU break iteration.
  
+ICU4J also includes copies of the test reference rules, located in the directory
+main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/
+The copies should be kept synchronized; there should be no differences.
+
  Each set of reference break rules lives in a separate file.
-The list of rule files to run by default is hardcoded into the test code, in rbbimonkeytest.cpp.
+The list of rule files to run by default is hard-coded into the test code, in rbbimonkeytest.cpp.
  
  Each test file includes
-  - The type of ICU break interator to create (word, line, sentence, etc.)
+  - The type of ICU break iterator to create (word, line, sentence, etc.)
    - The locale to use
    - Character Class definitions
    - Rule definitions
  
  To Do
-  - Syntax for tailoring.
+  - Extend the syntax to support rule tailoring.
  
  
-Character Class Definition: 
+Character Class Definition:
      name = set_regular_expression;
  
  Rule Definition:
@@ -35,7 +39,7 @@ set_regular_expression:
      (They are mostly the same)
      May include previously defined set names, which are logically expanded in-place.
  
-rule_regular_expresson:
+rule_regular_expression:
      An ICU Regular Expression.
      May include set names, which are logically expanded in-place.
      May include a '÷', which defines a boundary position.
@@ -52,7 +56,7 @@ Application of the rules:
                      return the position of the '÷' within the match.
                  else
                      position = last character of the rule match.
-                    break from the rule loop, continue the outer loop.
+                    break from the inner rule loop, continue the outer loop.
  
      This differs from the Unicode UAX algorithm in that each position in the text is
      not tested separately. Instead, when a rule match is found, rule application restarts with the last
@@ -66,7 +70,7 @@ Application of the rules:
      are with the Unicode UAX rules. With the main ICU break rules, all are applied in parallel.
  
  Word Dictionaries
-    The monkey test does not test dictionary based breaking. The set named 'dicitionary' is special,
+    The monkey test does not test dictionary-based breaking. The set named 'dictionary' is special,
      as it is in the main ICU rules. For the monkey test, no characters from the dictionary set are
      included in the randomly-generated test data.
  
diff --git a/icu4c/source/test/testdata/break_rules/word.txt b/icu4c/source/test/testdata/break_rules/word.txt

index 783dfc9201aa7902d526bff2c470ad41cb974c0e..fd9799cec7689811a55598947153784ed9a39643 100644 (file)
--- a/icu4c/source/test/testdata/break_rules/word.txt
+++ b/icu4c/source/test/testdata/break_rules/word.txt
@@ -39,7 +39,7 @@ EmojiNRK           = [[\p{Emoji}] - [[Regional_Indicator]\u002a\u00230-9©®™
  Extended_Pict      = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U
0001F932\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
  EBG                = [\p{Word_Break = EBG}];
  
-#define dicitionary, with the effect being that those characters don't appear in test data.
+#define dictionary, with the effect being that those characters don't appear in test data.
  
  Han            = [:Han:];
  Hiragana       = [:Hiragana:];
@@ -51,12 +51,7 @@ KanaKanji      = [Han Hiragana Katakana];
  dictionaryCJK  = [KanaKanji HangulSyllable];
  dictionary     = [ComplexContext dictionaryCJK];
  
-# leave CJK scripts out of ALetterPlus
-#   Tricky. Redfine a set.
-#   For tailorings, if it modifies itself, do at end of sets ????
-#   Tweak redefine to mean replace existing definition at its original location.
-#   Insert defs without redefine just after last pre-existing def of that name.
-#   Maybe drop redefine, add warning for sets defined and not used, should catch typos.
+# leave dictionary scripts out of ALetter
  
  ALetter        = [ALetter - dictionary];
  
diff --git a/icu4c/source/test/testdata/break_rules/word_POSIX.txt b/icu4c/source/test/testdata/break_rules/word_POSIX.txt

index 232e4ddb20aee313b4084883ac61429d3e17dae9..6e8be2c7baf29d3ac1b06c85ee9f02796ef2c997 100644 (file)
--- a/icu4c/source/test/testdata/break_rules/word_POSIX.txt
+++ b/icu4c/source/test/testdata/break_rules/word_POSIX.txt
@@ -38,7 +38,7 @@ EmojiNRK           = [[\p{Emoji}] - [[Regional_Indicator]\u002a\u00230-9©®™
  Extended_Pict      = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U
0001F932\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
  EBG                = [\p{Word_Break = EBG}];
  
-#define dicitionary, with the effect being that those characters don't appear in test data.
+#define dictionary, with the effect being that those characters don't appear in test data.
  
  Han            = [:Han:];
  Hiragana       = [:Hiragana:];
@@ -50,12 +50,7 @@ KanaKanji      = [Han Hiragana Katakana];
  dictionaryCJK  = [KanaKanji HangulSyllable];
  dictionary     = [ComplexContext dictionaryCJK];
  
-# leave CJK scripts out of ALetterPlus
-#   Tricky. Redfine a set.
-#   For tailorings, if it modifies itself, do at end of sets ????
-#   Tweak redefine to mean replace existing definition at its original location.
-#   Insert defs without redefine just after last pre-existing def of that name.
-#   Maybe drop redefine, add warning for sets defined and not used, should catch typos.
+# leave dictionary scripts out of ALetter
  
  ALetter        = [ALetter - dictionary];
  
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBIMonkeyTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBIMonkeyTest.java

new file mode 100644 (file)

index 0000000..6feef97
--- /dev/null
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBIMonkeyTest.java
@@ -0,0 +1,1050 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+
+package com.ibm.icu.dev.test.rbbi;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.junit.Test;
+
+import com.ibm.icu.dev.test.TestFmwk;
+import com.ibm.icu.impl.UCharacterName;
+import com.ibm.icu.impl.UCharacterNameChoice;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.util.ULocale;
+
+/**
+ * RBBI Monkey Test. Ported from ICU4C test/intltest/rbbimonkeytest.cpp.
+ * This is the newer, data driven monkey test. It is completely separate from the
+ * older class RBBITestMonkey.
+ */
+
+public class RBBIMonkeyTest extends TestFmwk {
+
+
+    //  class CharClass    One named character class (set) taken from the source break rules.
+    //                     A plain holder for the class name, its definition text before and
+    //                     after expansion, and the UnicodeSet supplied together with them.
+
+    static class CharClass {
+        String     fName;           // the name the rules use to refer to this class.
+        String     fOriginalDef;    // set definition as it appeared in user supplied rules.
+        String     fExpandedDef;    // expanded form of the definition, free of references to other named sets.
+        UnicodeSet fSet;            // the set handed to the constructor along with the definitions.
+        CharClass(String name, String originalDef, String expandedDef, UnicodeSet set) {
+            this.fSet = set;
+            this.fExpandedDef = expandedDef;
+            this.fOriginalDef = originalDef;
+            this.fName = name;
+        }
+    }
+
+
+    // class BreakRule    A passive record describing one rule out of a set of break rules:
+    //                    its name, its source text, the text after set expansion, and the
+    //                    matcher for the regular expression the rule is compiled to.
+
+    static class BreakRule {
+        String  fName;          // The rule's name.
+        String  fRule;          // The rule body as written in user source, without the name.
+        String  fExpandedRule;  // The rule body after the set definitions have been expanded.
+        Matcher fRuleMatcher;   // Matcher for the regular expression that matches the rule.
+    }
+
+
+    // class BreakRules    represents a complete set of break rules, possibly tailored,
+    //                     compiled from testdata break rules.
+
+    static class BreakRules {
+        BreakRules(RBBIMonkeyImpl monkeyImpl) {   // Creates the empty containers and the matchers used to parse rules.
+            fMonkeyImpl = monkeyImpl;
+            fBreakRules = new ArrayList<BreakRule>();
+            fType = BreakIterator.KIND_TITLE;   // Not a kind createICUBreakIterator() accepts; a "type" keyword replaces it.
+            fCharClasses = new HashMap<String, CharClass>();
+            fCharClassList = new ArrayList<CharClass>();
+            fDictionarySet = new UnicodeSet();
+
+            // Match an alpha-numeric identifier in a rule or set definition. Will be a set name.
+            // Use negative look-behind to exclude non-identifiers, mostly property names or values.
+            fSetRefsMatcher = Pattern.compile(
+                    "(?<!\\{[ \\t]{0,4})" +          // not preceded by '{' and up to four blanks or tabs
+                    "(?<!=[ \\t]{0,4})" +            // not preceded by '=' and up to four blanks or tabs
+                    "(?<!\\[:[ \\t]{0,4})" +         // not preceded by '[:' and up to four blanks or tabs
+                    "(?<!\\\\)" +                    // not preceded by a backslash (the letters belong to an escape)
+                    "(?<![A-Za-z0-9_])" +            // not starting in the middle of a longer identifier
+                    "([A-Za-z_][A-Za-z0-9_]*)").     // The char class name, capture group 1
+                    matcher("");
+
+            // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
+            fCommentsMatcher = Pattern.compile("" +
+                    "(^|(?<=;))"   +                // Start either at start of line, or just after a ';' (look-behind for ';')
+                    "[ \\t]*+"     +                //   Match white space.
+                    "(#.*)?+"      +                //   Optional # plus whatever follows
+                    "$").                           //   End of the line; compileRules() applies this to one line at a time.
+                    matcher("");
+
+            // Match (initial parse) of a character class definition line.
+            fClassDefMatcher = Pattern.compile("" +
+                    "[ \\t]*"           +                    // leading white space
+                    "([A-Za-z_][A-Za-z0-9_]*)" +             // The char class name, capture group 1
+                    "[ \\t]*=[ \\t]*"   +                    //   =
+                    "(.*?)"  +                               // The char class UnicodeSet expression, capture group 2
+                    "[ \\t]*;$").                            // ; <end of line>
+                    matcher("");
+
+            // Match (initial parse) of a break rule line.
+            fRuleDefMatcher = Pattern.compile("" +
+                    "[ \\t]*"           +                     // leading white space
+                    "([A-Za-z_][A-Za-z0-9_.]*)" +             // The rule name, capture group 1 (may contain '.')
+                    "[ \\t]*:[ \\t]*"   +                     //   :
+                    "(.*?)"   +                               // The rule definition, capture group 2
+                    "[ \\t]*;$").                             // ; <end of line>
+                    matcher("");
+
+            // Match a property expression, either [:xxx:] or \p{...} (also \P{...})
+            fPropertyMatcher = Pattern.compile("" +
+                    "\\[:.*?:]|\\\\(?:p|P)\\{.*?\\}").
+                    matcher("");
+
+
+        }
+
+        /**
+         * Build a CharClass from a set definition and store it in fCharClasses under its name.
+         * References to already defined classes are replaced by the expansions of those classes.
+         */
+        CharClass  addCharClass(String name, String definition) {
+            StringBuffer buf = new StringBuffer();
+            fSetRefsMatcher.reset(definition);
+            while (fSetRefsMatcher.find()) {
+                String refName = fSetRefsMatcher.group(/*"ClassName"*/ 1);
+                CharClass refClass = fCharClasses.get(refName);
+
+                // Identifiers that do not name a known class are copied through unchanged.
+                String substitute = (refClass == null) ? refName : refClass.fExpandedDef;
+                fSetRefsMatcher.appendReplacement(buf, Matcher.quoteReplacement(substitute));
+            }
+            fSetRefsMatcher.appendTail(buf);
+            String expansion = buf.toString();
+
+            final boolean dump = fMonkeyImpl.fDumpExpansions;
+            if (dump) {
+                System.out.printf("addCharClass(\"%s\"\n", name);
+                System.out.printf("             %s\n", definition);
+                System.out.printf("expandedDef: %s\n", expansion);
+            }
+
+            // Constructing a UnicodeSet from the expansion verifies that the definition is valid.
+
+            UnicodeSet set;
+            try {
+                set = new UnicodeSet(expansion, UnicodeSet.IGNORE_SPACE);
+            } catch (java.lang.IllegalArgumentException e) {
+                System.err.printf("%s: error %s creating UnicodeSet %s", fMonkeyImpl.fRuleFileName, e.toString(), name);
+                throw e;
+            }
+
+            // Ask the UnicodeSet for an equivalent pattern and keep that as the expansion.
+            // This removes set difference operators, which would fail if passed through to Java regex.
+
+            StringBuffer flattened = new StringBuffer();
+            set._generatePattern(flattened, true);
+            expansion = flattened.toString();
+            if (dump) {
+                System.out.printf("expandedDef2: %s\n", expansion);
+            }
+
+            CharClass cclass = new CharClass(name, definition, expansion, set);
+
+            // A class of the same name may already be present; put() replaces it without complaint.
+            // TODO: decide whether or not to allow redefinitions.
+            //       Can be convenient in some cases.
+            //       To reject them, test the value returned by put() for non-null, then
+            //       print and throw an IllegalArgumentException that names
+            //       fMonkeyImpl.fRuleFileName and cclass.fName.
+            fCharClasses.put(name, cclass);
+
+            return cclass;
+
+        }
+
+
+        /**
+         * Compile one break rule and append it to fBreakRules. The rule text is rewritten step by
+         * step (set names, properties, negated sets, ÷, escapes) into a Java regular expression.
+         * @throws IllegalArgumentException if the rule has more than one ÷, or if the rewritten
+         *         rule is not accepted as a regular expression.
+         */
+        void addRule(String  name, String  definition) {
+            BreakRule  thisRule = new BreakRule();
+            StringBuffer expandedDefsRule = new StringBuffer();
+            thisRule.fName = name;
+            thisRule.fRule = definition;
+
+            // Expand the char class definitions within the rule.
+            fSetRefsMatcher.reset(definition);
+            while (fSetRefsMatcher.find()) {
+                String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1);
+                CharClass nameClass = fCharClasses.get(sname);
+                if (nameClass == null) {
+                    System.err.printf("char class \"%s\" unrecognized in rule \"%s\"\n", sname, definition);
+                }
+                String expansionForName = nameClass != null ? nameClass.fExpandedDef : sname;
+                fSetRefsMatcher.appendReplacement(expandedDefsRule, "");
+                expandedDefsRule.append(expansionForName);
+            }
+            fSetRefsMatcher.appendTail(expandedDefsRule);
+
+            // Replace any property expressions, \p{...} or [:...:] with an equivalent expansion,
+            // obtained from ICU UnicodeSet. Need to do this substitution because Java regex
+            // does not recognize all properties, and because Java's definitions are likely
+            // older than ICU's.
+            // The expansion is quoted so that any '\' or '$' it contains is inserted literally.
+            StringBuffer expandedRule = new StringBuffer();
+            fPropertyMatcher.reset(expandedDefsRule);
+            while (fPropertyMatcher.find()) {
+                String prop = fPropertyMatcher.group();
+                UnicodeSet propSet = new UnicodeSet("[" + prop + "]");
+                StringBuffer propExpansion = new StringBuffer();
+                propSet._generatePattern(propExpansion, true);
+                fPropertyMatcher.appendReplacement(expandedRule, Matcher.quoteReplacement(propExpansion.toString()));
+            }
+            fPropertyMatcher.appendTail(expandedRule);
+
+            //   Replace any [^negated sets] with equivalent flattened sets generated by
+            //   ICU UnicodeSet. [^ ...] in Java Regex character classes does not apply
+            //   to any nested classes. Variable substitution in rules produces
+            //   nested sets that [^negation] needs to apply to.
+
+            StringBuffer ruleWithFlattenedSets = new StringBuffer();
+            int idx = 0;
+            while (idx<expandedRule.length()) {
+                int setOpenPos = expandedRule.indexOf("[^", idx);
+                if (setOpenPos < 0) {
+                    break;
+                }
+                if (setOpenPos > idx) {
+                    // Move anything from the source rule preceding the [^ into the processed rule, unchanged.
+                    ruleWithFlattenedSets.append(expandedRule.substring(idx,  setOpenPos));
+                }
+                int nestingLevel = 1;
+                boolean haveNesting = false;
+                int setClosePos;
+                for (setClosePos = setOpenPos + 2; nestingLevel > 0 && setClosePos<expandedRule.length(); ++setClosePos) {
+                    char c = expandedRule.charAt(setClosePos);
+                    if (c == '\\') {
+                        ++setClosePos;   // step over the escaped character; it cannot open or close a set.
+                    } else if (c == '[') {
+                        ++nestingLevel;
+                        haveNesting = true;
+                    } else if (c == ']') {
+                        --nestingLevel;
+                    }
+                }
+                if (haveNesting && nestingLevel == 0) {
+                    // Found one, a negated set that includes interior nested sets.
+                    // Create an ICU UnicodeSet from the source pattern, and obtain an
+                    // equivalent flattened pattern from that.
+                    UnicodeSet uset = new UnicodeSet(expandedRule.substring(setOpenPos, setClosePos), true);
+                    uset._generatePattern(ruleWithFlattenedSets, true);
+                } else {
+                    // The [^ set definition did not include any nested sets.
+                    // Copy the original definition without change.
+                    // Java regular expressions will handle it without needing to recast it.
+                    if (nestingLevel > 0) {
+                        // Error case of an unclosed character class expression.
+                        // Java regex will also eventually flag the error.
+                        System.err.printf("No closing ] found in rule %s\n", name);
+                    }
+                    ruleWithFlattenedSets.append(expandedRule.substring(setOpenPos, setClosePos));
+                }
+                idx = setClosePos;
+            }
+
+            if (idx < expandedRule.length()) {
+                ruleWithFlattenedSets.append(expandedRule.substring(idx, expandedRule.length()));
+            }
+
+            thisRule.fExpandedRule = ruleWithFlattenedSets.toString();
+
+            // A rule may contain at most one divide sign (\u00f7). Test for that before the
+            // substitution below; once replace() has run, no ÷ is left in the text to find.
+            if (thisRule.fExpandedRule.indexOf("÷") != thisRule.fExpandedRule.lastIndexOf("÷")) {
+                String msg = String.format("%s Rule %s contains multiple ÷ signs", fMonkeyImpl.fRuleFileName, name);
+                System.err.println(msg);
+                throw new IllegalArgumentException(msg);
+            }
+            // Replace the ÷ with an empty capture group, "()", standing in for "(?<BreakPosition>)".
+            // When running the rules, a match that includes this group means we found a break position.
+            thisRule.fExpandedRule = thisRule.fExpandedRule.replace("÷", "()");
+
+            // UAX break rule set definitions can be empty, just [].
+            // Regular expression set expressions don't accept this. Substitute with [a&&[^a]], which
+            // also matches nothing.
+
+            thisRule.fExpandedRule = thisRule.fExpandedRule.replace("[]", "[a&&[^a]]");
+
+            // Replace each \U00hhhhhh escape with the literal character it stands for.
+            // Java 6 has no syntax for escaping a supplementary character within a regular expression
+            // character class. (Java 7 or newer could use \x{hhhhhh} instead, and \x{dddd} for \udddd.)
+            StringBuilder sb = new StringBuilder(thisRule.fExpandedRule);
+            while (true) {
+                int where = sb.indexOf("\\U00");
+                if (where < 0) {
+                    break;
+                }
+                String cp = hexToCodePoint(sb.substring(where+2, where+10));
+                sb.replace(where, where+10, cp);
+            }
+            thisRule.fExpandedRule = sb.toString();
+
+            // Escape any literal '#' in the rule expression. Without escaping, these introduce a comment.
+            // UnicodeSet._generatePattern() inserts un-escaped "#"s
+
+            thisRule.fExpandedRule = thisRule.fExpandedRule.replace("#", "\\#");
+            if (fMonkeyImpl.fDumpExpansions) {
+                System.out.printf("fExpandedRule: %s\n", thisRule.fExpandedRule);
+            }
+
+            // Compile a regular expression for this rule.
+
+            try {
+                thisRule.fRuleMatcher = Pattern.compile(thisRule.fExpandedRule, Pattern.COMMENTS | Pattern.DOTALL).matcher("");
+            } catch (PatternSyntaxException e) {
+                System.err.printf("%s: Error creating regular expression for rule %s. Expansion is \n\"%s\"",
+                        fMonkeyImpl.fRuleFileName, name, thisRule.fExpandedRule);
+                throw e;
+            }
+
+            // Put this new rule into the vector of all Rules.
+
+            fBreakRules.add(thisRule);
+        };
+
+        private static String hexToCodePoint(String hex) {   // hex digits -> String holding that one code point
+            final int codePoint = Integer.parseInt(hex, 16);
+            return new String(Character.toChars(codePoint));
+        }
+
+
+        // Apply one keyword parameter. "locale" and "type" are handled and yield true;
+        // any other keyword yields false. An unknown value for "type" is reported and rejected.
+        boolean setKeywordParameter(String keyword, String value) {
+            if (keyword.equals("locale")) {
+                fLocale = new ULocale(value);
+                return true;
+            }
+            if (!keyword.equals("type")) {
+                return false;
+            }
+            final String[] typeNames = {"grapheme", "word", "line", "sentence"};
+            final int[] typeKinds = {BreakIterator.KIND_CHARACTER, BreakIterator.KIND_WORD,
+                                     BreakIterator.KIND_LINE, BreakIterator.KIND_SENTENCE};
+            for (int i = 0; i < typeNames.length; ++i) {
+                if (value.equals(typeNames[i])) {
+                    fType = typeKinds[i];
+                    return true;
+                }
+            }
+            String msg = String.format("%s: Unrecognized break type %s", fMonkeyImpl.fRuleFileName, value);
+            System.err.println(msg);
+            throw new IllegalArgumentException(msg);
+        }
+
+
+        RuleBasedBreakIterator createICUBreakIterator() {
+            BreakIterator bi;
+            switch(fType) {
+                case BreakIterator.KIND_CHARACTER:
+                    bi = (BreakIterator.getCharacterInstance(fLocale));
+                    break;
+                case BreakIterator.KIND_WORD:
+                    bi = (BreakIterator.getWordInstance(fLocale));
+                    break;
+                case BreakIterator.KIND_LINE:
+                    bi = (BreakIterator.getLineInstance(fLocale));
+                    break;
+                case BreakIterator.KIND_SENTENCE:
+                    bi = (BreakIterator.getSentenceInstance(fLocale));
+                    break;
+                default:
+                    String msg = String.format("%s: Bad break iterator type of %d", fMonkeyImpl.fRuleFileName, fType);
+                    System.err.println(msg);
+                    throw new IllegalArgumentException(msg);
+            }
+            return (RuleBasedBreakIterator)bi;
+
+        };
+
+
+
+        /**
+         * Parse the full text of a reference rule file, one line at a time.
+         * Each non-empty line (after comment removal) must be either a character class
+         * definition / keyword setting, or a rule definition.
+         * After all lines are processed, fCharClassList is filled in from fCharClasses,
+         * and a class named "__Others" is added for any code points not covered
+         * by the classes from the file.
+         *
+         * @param rules the contents of the rule file.
+         * @throws IllegalArgumentException if a line is not recognized, or if a char class
+         *         is stored under a name different from its own fName.
+         */
+        void compileRules(String rules) {
+            int lineNumber = 0;
+            for (String line: rules.split("\\r?\\n")) {
+                ++lineNumber;
+                // Strip comment lines.
+                fCommentsMatcher.reset(line);
+                line = fCommentsMatcher.replaceFirst("");
+                if (line.isEmpty()) {
+                    continue;
+                }
+
+                // Recognize character class definition and keyword lines
+                fClassDefMatcher.reset(line);
+                if (fClassDefMatcher.matches()) {
+                    // Capture groups are fetched by number; the intended group names are in the comments.
+                    String className = fClassDefMatcher.group(/*"ClassName"*/ 1);
+                    String classDef  = fClassDefMatcher.group(/*"ClassDef"*/ 2);
+                    if (fMonkeyImpl.fDumpExpansions) {
+                        System.out.printf("scanned class: %s = %s\n", className, classDef);
+                    }
+                    if (setKeywordParameter(className, classDef)) {
+                        // The scanned item was "type = ..." or "locale = ...", etc.
+                        //   which are not actual character classes.
+                        continue;
+                    }
+                    addCharClass(className, classDef);
+                    continue;
+                }
+
+                // Recognize rule lines.
+                fRuleDefMatcher.reset(line);
+                if (fRuleDefMatcher.matches()) {
+                    String ruleName = fRuleDefMatcher.group(/*"RuleName"*/ 1);
+                    String ruleDef  = fRuleDefMatcher.group(/*"RuleDef"*/ 2);
+                    if (fMonkeyImpl.fDumpExpansions) {
+                        System.out.printf("scanned rule: %s : %s\n", ruleName, ruleDef);
+                    }
+                    addRule(ruleName, ruleDef);
+                    continue;
+                }
+
+                // The line is neither a class definition, a keyword setting, nor a rule.
+                String msg = String.format("Unrecognized line in rule file %s:%d \"%s\"",
+                        fMonkeyImpl.fRuleFileName, lineNumber, line);
+                System.err.println(msg);
+                throw new IllegalArgumentException(msg);
+            }
+
+            // Build the vector of char classes, omitting the dictionary class if there is one.
+            // This will be used when constructing the random text to be tested.
+
+            // Also compute the "other" set, consisting of any characters not included in
+            // one or more of the user defined sets.
+
+            UnicodeSet otherSet = new UnicodeSet(0, 0x10ffff);
+
+            for (Map.Entry<String, CharClass> el: fCharClasses.entrySet()) {
+                String ccName = el.getKey();
+                CharClass cclass = el.getValue();
+
+                // System.out.printf("    Adding %s\n", ccName);
+                if (!ccName.equals(cclass.fName)) {
+                    throw new IllegalArgumentException(
+                            String.format("%s: internal error, set names (%s, %s) inconsistent.\n",
+                                    fMonkeyImpl.fRuleFileName, ccName, cclass.fName));
+                }
+                // The dictionary class is also removed from otherSet, but is kept out of fCharClassList.
+                otherSet.removeAll(cclass.fSet);
+                if (ccName.equals("dictionary")) {
+                    fDictionarySet = cclass.fSet;
+                } else {
+                    fCharClassList.add(cclass);
+                }
+            }
+
+            if (!otherSet.isEmpty()) {
+                // System.out.printf("have an other set.\n");
+                CharClass cclass = addCharClass("__Others", otherSet.toPattern(true));
+                fCharClassList.add(cclass);
+            }
+
+        };
+
+        /**
+         * Find the first character class in fCharClassList whose set contains a code point.
+         *
+         * @param c the code point to look up.
+         * @return the first containing CharClass, or null if no listed class contains c.
+         */
+        CharClass getClassForChar(int c) {
+            CharClass found = null;
+            for (int i = 0; found == null && i < fCharClassList.size(); ++i) {
+                CharClass candidate = fCharClassList.get(i);
+                if (candidate.fSet.contains(c)) {
+                    found = candidate;
+                }
+            }
+            return found;
+        }
+
+
+        RBBIMonkeyImpl          fMonkeyImpl;        // Reference back to the owning RBBIMonkeyImpl instance.
+        List<BreakRule>         fBreakRules;        // All of the break rules; elements are BreakRule objects.
+
+        Map<String, CharClass>  fCharClasses;       // Key is the set name.
+        //                                          // Value is the corresponding CharClass
+        List<CharClass>         fCharClassList;     // Char classes taken from fCharClasses by compileRules(),
+        //                                          // omitting any "dictionary" class.
+
+        UnicodeSet              fDictionarySet;     // Dictionary set, empty if none is defined.
+        ULocale                 fLocale;
+        int                     fType;              // BreakIterator.KIND_WORD, etc.
+
+
+        // Regular expression matchers used while parsing the rule file.
+        Matcher fSetRefsMatcher;
+        Matcher fCommentsMatcher;
+        Matcher fClassDefMatcher;
+        Matcher fRuleDefMatcher;
+        Matcher fPropertyMatcher;
+    };
+
+
+
+
+    // class MonkeyTestData    represents a randomly synthesized test data string together
+    //                         with the expected break positions obtained by applying
+    //                         the test break rules.
+
+    static class MonkeyTestData{
+
+        /**
+         * Fill this object with a new random test string and compute the expected
+         * break positions for it by applying the reference rules.
+         * Also (re)allocates the actual-breaks and rule-for-position arrays to match the new string.
+         *
+         * @param rules the reference break rules and character classes to use.
+         * @param rand  the random number source; its seed at entry is saved in fRandomSeed.
+         * @throws IllegalArgumentException if no reference rule matches at some position,
+         *         if the chosen rule matched zero chars, or if a non-breaking match
+         *         would not advance the position.
+         */
+        void set(BreakRules rules, ICU_Rand rand) {
+            int dataLength = 1000;   // length of test data to generate, in code points.
+
+            // Fill the test string with random characters.
+            // First randomly pick a char class, then randomly pick a character from that class.
+            // Exclude any characters from the dictionary set.
+
+            // System.out.println("Populating Test Data");
+            fRandomSeed = rand.getSeed();         // Save initial seed for use in error messages,
+                                                  // allowing recreation of failing data.
+            fBkRules = rules;
+            StringBuilder newString = new StringBuilder();
+            // n counts only the code points actually appended; rejected picks do not advance it.
+            for (int n=0; n<dataLength;) {
+                int charClassIndex = rand.next() % rules.fCharClassList.size();
+                CharClass cclass = rules.fCharClassList.get(charClassIndex);
+                if (cclass.fSet.size() == 0) {
+                    // Some rules or tailorings do end up with empty char classes.
+                    continue;
+                }
+                int charIndex = rand.next() % cclass.fSet.size();
+                int c = cclass.fSet.charAt(charIndex);
+                if (/*Character.isBmpCodePoint(c)*/ c<=0x0ffff && Character.isLowSurrogate((char)c) &&
+                        newString.length() > 0 && Character.isHighSurrogate(newString.charAt(newString.length()-1))) {
+                    // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
+                    // Don't let random unpaired surrogates combine in the test data because they might
+                    // produce an unwanted dictionary character.
+                    continue;
+                }
+
+                if (!rules.fDictionarySet.contains(c)) {
+                    newString.appendCodePoint(c);
+                    ++n;
+                }
+            }
+            fString = newString.toString();
+
+            // Init the expectedBreaks, actualBreaks and ruleForPosition.
+            // Expected and Actual breaks are one longer than the input string; a true value
+            // will indicate a boundary preceding that position.
+
+            fActualBreaks    = new boolean[fString.length()+1];
+            fExpectedBreaks  = new boolean[fString.length()+1];
+            fRuleForPosition = new int[fString.length()+1];
+            f2ndRuleForPos   = new int[fString.length()+1];
+
+            // Apply reference rules to find the expected breaks.
+
+            fExpectedBreaks[0] = true;       // Force an expected break before the start of the text.
+                                             // ICU always reports a break there.
+                                             // The reference rules do not have a means to do so.
+            int strIdx = 0;
+            while (strIdx < fString.length()) {
+                BreakRule matchingRule = null;
+                boolean hasBreak = false;
+                int ruleNum = 0;
+                int matchStart = 0;
+                int matchEnd = 0;
+                // Try the rules in order; the first acceptable match wins.
+                for (ruleNum=0; ruleNum<rules.fBreakRules.size(); ruleNum++) {
+                    BreakRule rule = rules.fBreakRules.get(ruleNum);
+                    // The matcher sees only the text from strIdx on, so its offsets are relative to strIdx.
+                    rule.fRuleMatcher.reset(fString.substring(strIdx));
+                    if (rule.fRuleMatcher.lookingAt()) {
+                        // A candidate rule match, check further to see if we take it or continue to check other rules.
+                        // Matches of zero or one code point count only if they also specify a break.
+                        matchStart = strIdx;
+                        matchEnd = strIdx + rule.fRuleMatcher.end();
+                        hasBreak = BreakGroupStart(rule.fRuleMatcher) >= 0;
+                        if (hasBreak ||
+                                (matchStart < fString.length() && fString.offsetByCodePoints(matchStart, 1) < matchEnd)) {
+                            matchingRule = rule;
+                            break;
+                        }
+                    }
+                }
+                if (matchingRule == null) {
+                    // No reference rule matched. This is an error in the rules that should never happen.
+                    String msg = String.format("%s: No reference rules matched at position %d. ",
+                            rules.fMonkeyImpl.fRuleFileName, strIdx);
+                    System.err.println(msg);
+                    dump(strIdx);
+                    throw new IllegalArgumentException(msg);
+                }
+                if (matchingRule.fRuleMatcher.group().length() == 0) {
+                    // Zero length rule match. This is also an error in the rule expressions.
+                    String msg = String.format("%s:%s: Zero length rule match at %d.",
+                            rules.fMonkeyImpl.fRuleFileName, matchingRule.fName, strIdx);
+                    System.err.println(msg);
+                    dump(strIdx);
+                    throw new IllegalArgumentException(msg);
+                }
+
+                // Record which rule matched over the length of the match.
+                // A value of 0 in fRuleForPosition is treated as "nothing recorded yet".
+                for (int i = matchStart; i < matchEnd; i++) {
+                    if (fRuleForPosition[i] == 0) {
+                        fRuleForPosition[i] = ruleNum;
+                    } else {
+                        f2ndRuleForPos[i] = ruleNum;
+                    }
+                }
+
+                // Break positions appear in rules as a matching named capture of zero length at the break position,
+                //   the adjusted pattern contains (?<BreakPosition>)
+                if (hasBreak) {
+                    int breakPos = strIdx + BreakGroupStart(matchingRule.fRuleMatcher);
+                    fExpectedBreaks[breakPos] = true;
+                    // System.out.printf("recording break at %d\n", breakPos);
+                    // For the next iteration, pick up applying rules immediately after the break,
+                    // which may differ from end of the match. The matching rule may have included
+                    // context following the boundary that needs to be looked at again.
+                    strIdx = breakPos;
+                } else {
+                    // Original rule didn't specify a break.
+                    // Continue applying rules starting on the last code point of this match.
+                    int updatedStrIdx = fString.offsetByCodePoints(matchEnd, -1);
+                    if (updatedStrIdx == matchStart) {
+                        // Match was only one code point, no progress if we continue.
+                        // Shouldn't get here, case is filtered out at top of loop.
+                        throw new IllegalArgumentException(String.format("%s: Rule %s internal error.",
+                                rules.fMonkeyImpl.fRuleFileName, matchingRule.fName));
+                    }
+                    strIdx = updatedStrIdx;
+                }
+            }
+        };
+
+        // Helper function to find the starting index of a match of the "BreakPosition" named capture group.
+        // @param m: a Java regex Matcher that has completed a matching operation.
+        // @return m.start("BreakPosition),
+        //         or -1 if there is no such group, or the group did not participate in the match.
+        //
+        // TODO: this becomes m.start("BreakPosition") with Java 8.
+        //       In the mean time, assume that the only zero-length capturing group in
+        //       a reference rule expression is the "BreakPosition" that corresponds to a "÷".
+
+        static int BreakGroupStart(Matcher m) {
+            for (int groupNum=1; groupNum <= m.groupCount(); ++groupNum) {
+                String group = m.group(groupNum);
+                if (group == null) {
+                    continue;
+                }
+                if (group.equals("")) {
+                    // assert(m.end(groupNum) == m.end("BreakPosition"));
+                    return m.start(groupNum);
+                }
+            }
+            return -1;
+        }
+
+        void dump(int around) {
+            System.out.print("\n"
+                    +        "         char                        break  Rule                     Character\n"
+                    +        "   pos   code   class                 R I   name                     name\n"
+                    +        "---------------------------------------------------------------------------------------------\n");
+
+            int start;
+            int end;
+
+            if (around == -1) {
+                start = 0;
+                end = fString.length();
+            } else {
+                // Display context around a failure.
+                try {
+                    start = fString.offsetByCodePoints(around, -30);
+                } catch (Exception e) {
+                    start = 0;
+                }
+                try {
+                    end = fString.offsetByCodePoints(around, +30);
+                } catch (Exception e) {
+                    end = fString.length();
+                }
+            }
+
+            for (int charIdx = start; charIdx < end; charIdx=fString.offsetByCodePoints(charIdx, 1)) {
+                int c = fString.codePointAt(charIdx);
+                CharClass cc = fBkRules.getClassForChar(c);
+
+                BreakRule rule = fBkRules.fBreakRules.get(fRuleForPosition[charIdx]);
+                String secondRuleName = "";
+                if (f2ndRuleForPos[charIdx] > 0) {
+                    secondRuleName = fBkRules.fBreakRules.get(f2ndRuleForPos[charIdx]).fName;
+                }
+                String cName = UCharacterName.INSTANCE.getName(c, UCharacterNameChoice.EXTENDED_CHAR_NAME);
+
+                System.out.printf("  %4d %6x   %-20s  %c %c   %-10s %-10s    %s\n",
+                        charIdx, c, cc.fName,
+                        fExpectedBreaks[charIdx] ? '*' : '.',
+                        fActualBreaks[charIdx] ? '*' : '.',
+                        rule.fName, secondRuleName, cName
+                        );
+                }
+
+        };
+
+        /** Reset every entry of fActualBreaks to false, ready for the next test pass. */
+        void clearActualBreaks() {
+            for (int i = 0; i < fActualBreaks.length; ++i) {
+                fActualBreaks[i] = false;
+            }
+        }
+
+
+        int               fRandomSeed;        // The initial seed value from the random number generator.
+        BreakRules        fBkRules;           // The break rules used to generate this data.
+        String            fString;            // The text.
+        boolean           fExpectedBreaks[];  // Breaks as found by the reference rules.
+                                              //     Parallel to fString. true if break preceding.
+        boolean           fActualBreaks[];    // Breaks as found by ICU break iterator.
+        int               fRuleForPosition[]; // Index into BreakRules.fBreakRules of rule that applied at each position.
+                                              // Also parallel to fString.
+        int               f2ndRuleForPos[];   // As above. A 2nd rule applies when the preceding rule
+                                              //   didn't cause a break, and a subsequent rule match starts
+                                              //   on the last code point of the preceding match.
+
+    }
+
+
+    // class RBBIMonkeyImpl     holds (some indirectly) everything associated with running a monkey
+    //                          test for one set of break rules.
+    //
+
+    static class RBBIMonkeyImpl extends Thread {
+
+        /**
+         * Prepare this object for testing with one rule file: read the file,
+         * compile the reference rules, create the ICU break iterator to be tested,
+         * and create the (still empty) test data holder.
+         *
+         * If the rule file cannot be opened, the failure has already been reported by
+         * openBreakRules(); this function then returns with fBI left unset, and
+         * run() will report that it is unable to run the test.
+         *
+         * @param ruleFile the name of the rule file, e.g. "word.txt".
+         */
+        void setup(String ruleFile) {
+            fRuleFileName = ruleFile;
+            fRuleCharBuffer = null;      // Do not reuse the contents from any earlier setup.
+            openBreakRules(ruleFile);
+            if (fRuleCharBuffer == null) {
+                // The rule file was not found. Without this check compileRules() would
+                // fail with a NullPointerException, hiding the real problem.
+                return;
+            }
+            fRuleSet = new BreakRules(this);
+            fRuleSet.compileRules(fRuleCharBuffer);
+            fBI = fRuleSet.createICUBreakIterator();
+            fTestData = new MonkeyTestData();
+        }
+
+        void openBreakRules(String fileName) {
+            StringBuilder testFileBuf = new StringBuilder();
+            InputStream is = null;
+            String filePath = "break_rules/" + fileName;
+            try {
+                is = RBBIMonkeyImpl.class.getResourceAsStream(filePath);
+                if (is == null) {
+                    errln("Could not open test data file " + fileName);
+                    return;
+                }
+                InputStreamReader isr = new InputStreamReader(is, "UTF-8");
+                try {
+                    int c;
+                    int count = 0;
+                    for (;;) {
+                        c = isr.read();
+                        if (c < 0) {
+                            break;
+                        }
+                        count++;
+                        if (c == 0xFEFF && count == 1) {
+                            // BOM in the test data file. Discard it.
+                            continue;
+                        }
+                       testFileBuf.appendCodePoint(c);
+                    }
+                } finally {
+                    isr.close();
+                }
+                } catch (IOException e) {
+                try {
+                    is.close();
+                } catch (IOException ignored) {
+                }
+                errln(e.toString());
+            }
+            fRuleCharBuffer =  testFileBuf.toString();  /* the file as a String */
+        }
+
+        class MonkeyException extends RuntimeException  {
+            private static final long serialVersionUID = 1L;
+            public int fPosition;    // Position of the failure in the test data.
+            MonkeyException(String description, int pos) {
+                super(description);
+                fPosition = pos;
+            }
+        }
+
+        @Override
+        public void run() {
+            int errorCount = 0;
+            if (fBI == null) {
+                fErrorMsgs.append("Unable to run test because fBI is null.\n");
+                return;
+            }
+            for (long loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
+                try {
+                    fTestData.set(fRuleSet, fRandomGenerator);
+                    // fTestData.dump(-1);
+                    testForwards();
+                    testPrevious();
+                    testFollowing();
+                    testPreceding();
+                    testIsBoundary();
+                } catch (MonkeyException e) {
+                    String formattedMsg = String.format(
+                            "%s at index %d. VM Arguments to reproduce: -Drules=%s -Dseed=%d -Dloop=1 -Dverbose=1 \"\n",
+                            e.getMessage(), e.fPosition, fRuleFileName, fTestData.fRandomSeed);
+                    System.err.print(formattedMsg);
+                    if (fVerbose) {
+                        fTestData.dump(e.fPosition);
+                    }
+                    fErrorMsgs.append(formattedMsg);
+                    if (++errorCount > 10) {
+                        return;
+                    }
+                }
+                if (fLoopCount < 0 && loopCount % 100 == 0) {
+                    System.err.print(".");
+                }
+            }
+        }
+
+        // The order in which checkResults() compares the expected and actual break positions.
+        enum CheckDirection {
+            FORWARD,     // Compare starting from index 0, going up to the end of the text.
+            REVERSE      // Compare starting from the end of the text, going down to index 0.
+        };
+
+        /**
+         * Test forward iteration with first() / next(): record every boundary returned
+         * in fTestData.fActualBreaks, then compare with the expected breaks.
+         *
+         * @throws MonkeyException if the iterator fails to advance, returns a boundary
+         *         outside of the text, or the boundaries differ from those expected.
+         */
+        void testForwards() {
+            fTestData.clearActualBreaks();
+            fBI.setText(fTestData.fString);
+            int previousBreak = -2;
+            for (int bk=fBI.first(); bk != BreakIterator.DONE; bk=fBI.next()) {
+                if (bk <= previousBreak) {
+                    throw new MonkeyException("Break Iterator Stall", bk);
+                }
+                if (bk < 0 || bk > fTestData.fString.length()) {
+                    throw new MonkeyException("Boundary out of bounds", bk);
+                }
+                fTestData.fActualBreaks[bk] = true;
+                // Remember this boundary. Without this the stall check above can never
+                // trigger, and an iterator that stops advancing would loop here forever.
+                previousBreak = bk;
+            }
+            checkResults("testForwards", CheckDirection.FORWARD);
+        }
+
+
+       void testFollowing() {
+           fTestData.clearActualBreaks();
+           fBI.setText(fTestData.fString);
+           int nextBreak = -1;
+           for (int i=-1 ; i<fTestData.fString.length(); ++i) {
+               int bk = fBI.following(i);
+               if (bk == BreakIterator.DONE && i == fTestData.fString.length()) {
+                   continue;
+               }
+               if (bk == nextBreak && bk > i) {
+                   // i is in the gap between two breaks.
+                   continue;
+               }
+               if (i == nextBreak && bk > nextBreak) {
+                   fTestData.fActualBreaks[bk] = true;
+                   nextBreak = bk;
+                   continue;
+               }
+               throw new MonkeyException("following(i)", i);
+           }
+           checkResults("testFollowing", CheckDirection.FORWARD);
+        };
+
+
+        /**
+         * Test reverse iteration with last() / previous(): record every boundary returned
+         * in fTestData.fActualBreaks, then compare with the expected breaks.
+         *
+         * @throws MonkeyException if the iterator fails to move backwards, returns a
+         *         boundary outside of the text, or the boundaries differ from those expected.
+         */
+        void testPrevious() {
+            fTestData.clearActualBreaks();
+            fBI.setText(fTestData.fString);
+            int previousBreak = Integer.MAX_VALUE;
+            for (int bk=fBI.last(); bk != BreakIterator.DONE; bk=fBI.previous()) {
+                if (bk >= previousBreak) {
+                    throw new MonkeyException("Break Iterator Stall", bk);
+                }
+                if (bk < 0 || bk > fTestData.fString.length()) {
+                    throw new MonkeyException("Boundary out of bounds", bk);
+                }
+                fTestData.fActualBreaks[bk] = true;
+                // Remember this boundary. Without this the stall check above can never
+                // trigger, and an iterator that stops moving would loop here forever.
+                previousBreak = bk;
+            }
+            checkResults("testPrevious", CheckDirection.REVERSE);
+        }
+
+
+        /**
+         * Given an index into a string, if it refers to the trail surrogate of a surrogate pair,
+         * adjust it to point to the lead surrogate, which is the start of the code point.
+         * @param s the String.
+         * @param i the initial index
+         * @return the adjusted index
+         */
+        private int getChar32Start(String s, int i) {
+            if (i > 0 && i < s.length() &&
+                    Character.isLowSurrogate(s.charAt(i)) && Character.isHighSurrogate(s.charAt(i-1))) {
+                --i;
+            }
+            return i;
+        }
+
+
+        /**
+         * Test preceding(): call it for every index from one past the length of the text
+         * down to 0. The results must step backwards from each boundary to the one before;
+         * the boundaries are recorded and then compared with the expected breaks.
+         *
+         * @throws MonkeyException if preceding(i) returns an inconsistent result, or
+         *         if the boundaries differ from those expected.
+         */
+        void testPreceding() {
+            fTestData.clearActualBreaks();
+            fBI.setText(fTestData.fString);
+            // nextBreak holds the index from which the next new boundary is expected to be found.
+            // It starts at one past the end of the text, the same as the first value of i.
+            int nextBreak = fTestData.fString.length()+1;
+            for (int i=fTestData.fString.length()+1 ; i>=0; --i) {
+                int bk = fBI.preceding(i);
+                // System.err.printf("testPreceding() i:%d  bk:%d  nextBreak:%d\n", i, bk, nextBreak);
+                if (bk == BreakIterator.DONE && i == 0) {
+                    // There is no boundary preceding the start of the text.
+                    continue;
+                }
+                if (bk == nextBreak && bk < i) {
+                    // i is in the gap between two breaks.
+                    continue;
+                }
+                if (i<fTestData.fString.length() && getChar32Start(fTestData.fString, i) < i) {
+                    // i indexes to a trailing surrogate.
+                    // Break Iterators treat an index to either half as referring to the supplemental code point,
+                    // with preceding going to some preceding code point.
+                    if (fBI.preceding(i) != fBI.preceding(getChar32Start(fTestData.fString, i))) {
+                        throw new MonkeyException("preceding of trailing surrogate error", i);
+                    }
+                    continue;
+                }
+                if (i == nextBreak && bk < nextBreak) {
+                    // i is on the most recently found boundary; bk is the boundary before it.
+                    fTestData.fActualBreaks[bk] = true;
+                    nextBreak = bk;
+                    continue;
+                }
+                throw new MonkeyException("preceding(i)", i);
+            }
+            checkResults("testPreceding", CheckDirection.REVERSE);
+
+        };
+
+
+        /**
+         * Test isBoundary(): call it for every index of the text, from the end down to 0,
+         * record the indexes reported as boundaries, then compare with the expected breaks.
+         *
+         * @throws MonkeyException if the boundaries differ from those expected.
+         */
+        void testIsBoundary() {
+            fTestData.clearActualBreaks();
+            fBI.setText(fTestData.fString);
+            for (int i=fTestData.fString.length(); i>=0; --i) {
+                if (fBI.isBoundary(i)) {
+                    fTestData.fActualBreaks[i] = true;
+                }
+            }
+            // Report failures under this test's own name. The label was formerly
+            // "testForwards", which attributed isBoundary() failures to the wrong test.
+            checkResults("testIsBoundary", CheckDirection.FORWARD);
+        }
+
+
+        void checkResults(String msg, CheckDirection direction) {
+            if (direction == CheckDirection.FORWARD) {
+                for (int i=0; i<=fTestData.fString.length(); ++i) {
+                    if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) {
+                        throw new MonkeyException(msg, i);
+                    }
+                }
+            } else {
+                for (int i=fTestData.fString.length(); i>=0; i--) {
+                    if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) {
+                        throw new MonkeyException(msg, i);
+                    }
+                }
+            }
+
+        };
+
+        String                 fRuleCharBuffer;         // source file contents of the reference rules.
+        BreakRules             fRuleSet;
+        RuleBasedBreakIterator fBI;
+        MonkeyTestData         fTestData;
+        ICU_Rand               fRandomGenerator;
+        String                 fRuleFileName;
+        boolean                fVerbose;                 // True to do long dump of failing data.
+        int                    fLoopCount;
+        int                    fErrorCount;
+
+        boolean                fDumpExpansions;          // Debug flag to output expanded form of rules and sets.
+        StringBuilder          fErrorMsgs = new StringBuilder();
+
+    }
+
+    //  Test parameters, specified via Java properties.
+    //
+    //  rules=file_name   Name of file containing the reference rules.
+    //  seed=nnnnn        Random number starting seed.
+    //                    Setting the seed allows errors to be reproduced.
+    //  loop=nnn          Looping count.  Controls running time.
+    //                    -1:  run forever.
+    //                     0 or greater:  run length.
+    //  expansions        debug option, show expansions of rules and sets.
+    //  verbose           Display details of the failure.
+    //
+    // Parameters are passed to the JVM on the command line, or
+    // via the Eclipse Run Configuration settings, arguments tab, VM parameters.
+    // For example,
+    //      -ea -Drules=line.txt -Dloop=-1
+    //
+    @Test
+    public void TestMonkey() {
+        String tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
+                "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt"
+        };
+
+        String testNameFromParams = getProperty("rules");
+
+        if (testNameFromParams != null) {
+            tests = new String[] {testNameFromParams};
+        }
+
+        int loopCount = getIntProperty("loop", isQuick() ? 100 : 5000);
+        boolean dumpExpansions =  getBooleanProperty("expansions", false);
+        boolean verbose = getBooleanProperty("verbose", false);
+        int seed = getIntProperty("seed", 1);
+
+        List<RBBIMonkeyImpl> startedTests = new ArrayList<RBBIMonkeyImpl>();
+
+        // Monkey testing is multi-threaded.
+        // Each set of break rules to be tested is run in a separate thread.
+        // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
+
+        for (String testName: tests) {
+            logln(String.format("beginning testing of %s", testName));
+
+            RBBIMonkeyImpl test = new RBBIMonkeyImpl();
+
+            test.fDumpExpansions = dumpExpansions;
+            test.fVerbose = verbose;
+            test.fRandomGenerator = new ICU_Rand(seed);
+            test.fLoopCount = loopCount;
+            test.setup(testName);
+
+            test.start();
+            startedTests.add(test);
+        }
+
+        StringBuilder errors = new StringBuilder();
+        for (RBBIMonkeyImpl test: startedTests) {
+            try {
+                test.join();
+                errors.append(test.fErrorMsgs);
+            } catch (InterruptedException e) {
+                errors.append(e + "\n");
+            }
+        }
+        String errorMsgs = errors.toString();
+        assertEquals(errorMsgs, "", errorMsgs);
+
+    }
+
+
+}
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java

index d4e87ab0f654c4f7579408d1701f197c752a83da..a93a8623e90843953469dc20f4fa8bb404724620 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
@@ -9,7 +9,10 @@
  package com.ibm.icu.dev.test.rbbi;
  
  
-// Monkey testing of RuleBasedBreakIterator
+// Monkey testing of RuleBasedBreakIterator.
+//    The old, original monkey test. TODO: remove
+//    The new monkey test is class RBBIMonkeyTest.
+
  import java.util.ArrayList;
  import java.util.Arrays;
  import java.util.List;
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/grapheme.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/grapheme.txt

new file mode 100644 (file)

index 0000000..0b551ba
--- /dev/null
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/grapheme.txt
@@ -0,0 +1,69 @@
+#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
+
+# file: grapheme.txt
+#
+# Reference Grapheme Break rules for intltest rbbi/RBBIMonkeyTest
+#
+#
+# Note: Rule syntax and the monkey test itself are still a work in progress.
+#       They are expected to change with review and the addition of support for rule tailoring.
+
+type = grapheme;      # one of grapheme | word | line | sentence
+locale = en;
+
+CR                 = [\p{Grapheme_Cluster_Break = CR}];
+LF                 = [\p{Grapheme_Cluster_Break = LF}];
+
+Control            = [[\p{Grapheme_Cluster_Break = Control}]];
+Extend             = [[\p{Grapheme_Cluster_Break = Extend}]];
+ZWJ                = [\p{Grapheme_Cluster_Break = ZWJ}];
+Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
+Prepend            = [\p{Grapheme_Cluster_Break = Prepend}];
+SpacingMark        = [\p{Grapheme_Cluster_Break = SpacingMark}];
+
+#
+# Korean Syllable Definitions
+#
+L                  = [\p{Grapheme_Cluster_Break = L}];
+V                  = [\p{Grapheme_Cluster_Break = V}];
+T                  = [\p{Grapheme_Cluster_Break = T}];
+LV                 = [\p{Grapheme_Cluster_Break = LV}];
+LVT                = [\p{Grapheme_Cluster_Break = LVT}];
+
+# Emoji definitions
+
+EmojiNRK           = [[\p{Emoji}] - [Regional_Indicator\u002a\u00230-9©®™〰〽]];
+E_Base             = [\p{Grapheme_Cluster_Break = EB}];
+E_Modifier         = [\p{Grapheme_Cluster_Break = EM}];
+E_Base_GAZ         = [\p{Grapheme_Cluster_Break = EBG}];
+
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
+Extended_Pict         = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-
\U0001F932\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
+
+
+GB3:     CR LF;
+GB4:     (Control | CR | LF) ÷;
+GB5:     . ÷ (Control | CR | LF);
+
+GB6:     L (L | V | LV | LVT);
+GB7:     (LV | V) (V | T);
+GB8:     (LVT | T) T;
+
+GB10:    (E_Base | E_Base_GAZ) Extend* E_Modifier;
+GB11:    (Extended_Pict | EmojiNRK) Extend* ZWJ (Extended_Pict | EmojiNRK);
+GB9:     . (Extend | ZWJ);
+
+GB9a:    . SpacingMark;
+GB9b:    Prepend .;
+
+# Regional Indicators, split into pairs.
+#      Note that a pair of RIs that is not followed by a third RI will fall into
+#      the normal rules for Extend, etc.
+#
+GB12:  Regional_Indicator Regional_Indicator ÷ Regional_Indicator;
+GB13:  Regional_Indicator Regional_Indicator;
+
+GB999:     . ÷;
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt

new file mode 100644 (file)

index 0000000..5059d2d
--- /dev/null
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt
@@ -0,0 +1,200 @@
+#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
+
+# file: line.txt
+#
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+#
+# Note: Rule syntax and the monkey test itself are still a work in progress.
+#       They are expected to change with review and the addition of support for rule tailoring.
+
+
+type = line;
+locale = en;
+
+
+AI = [:LineBreak =  Ambiguous:];
+AL = [:LineBreak =  Alphabetic:];
+BA = [:LineBreak =  Break_After:];
+BB = [:LineBreak =  Break_Before:];
+BK = [:LineBreak =  Mandatory_Break:];
+B2 = [:LineBreak =  Break_Both:];
+CB = [:LineBreak =  Contingent_Break:];
+CJ = [:LineBreak =  Conditional_Japanese_Starter:];
+CL = [:LineBreak =  Close_Punctuation:];
+CM = [:LineBreak =  Combining_Mark:];
+CP = [:LineBreak =  Close_Parenthesis:];
+CR = [:LineBreak =  Carriage_Return:];
+EB = [:LineBreak =  EB:];
+EM = [:LineBreak =  EM:];
+EX = [:LineBreak =  Exclamation:];
+GL = [:LineBreak =  Glue:];
+HL = [:LineBreak =  Hebrew_Letter:];
+HY = [:LineBreak =  Hyphen:];
+H2 = [:LineBreak =  H2:];
+H3 = [:LineBreak =  H3:];
+ID = [:LineBreak =  Ideographic:];
+IN = [:LineBreak =  Inseperable:];
+IS = [:LineBreak =  Infix_Numeric:];
+JL = [:LineBreak =  JL:];
+JV = [:LineBreak =  JV:];
+JT = [:LineBreak =  JT:];
+LF = [:LineBreak =  Line_Feed:];
+NL = [:LineBreak =  Next_Line:];
+NS = [[:LineBreak =  Nonstarter:] CJ];   # CSS Strict tailoring: CJ resolves to NS.
+NU = [:LineBreak =  Numeric:];
+OP = [:LineBreak =  Open_Punctuation:];
+PO = [:LineBreak =  Postfix_Numeric:];
+PR = [:LineBreak =  Prefix_Numeric:];
+QU = [:LineBreak =  Quotation:];
+RI = [:LineBreak =  Regional_Indicator:];
+SA = [:LineBreak =  Complex_Context:];
+SG = [:LineBreak =  Surrogate:];
+SP = [:LineBreak =  Space:];
+SY = [:LineBreak =  Break_Symbols:];
+WJ = [:LineBreak =  Word_Joiner:];
+XX = [:LineBreak =  Unknown:];
+ZW = [:LineBreak =  ZWSpace:];
+ZWJ = [:LineBreak =  ZWJ:];
+
+EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
+Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F9
32\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
+
+# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
+AL = [AL AI SG XX ];
+dictionary = SA;
+
+# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
+#         list it in the numerous rules that use CM.
+CM = [CM ZWJ];
+
+LB4:        BK ÷;
+LB5:        CR LF;
+LB5.1:      CR ÷;
+LB5.2:      LF ÷;
+LB5.3:      NL ÷;
+
+LB6:        . (BK | CR | LF | NL);
+LB6.1:      [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
+
+# Rules LB14 - LB17.
+# Moved before LB7, because they can match a longer sequence that would also match LB7,
+# for example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
+#                           "OP CM SP", matches LB7.1
+LB14:        OP CM* SP* .;
+LB15:        QU CM* SP* OP;
+LB16:        (CL | CP)CM* SP* NS;
+LB17:        B2 CM* SP* B2;
+
+LB7.1:      [^ZW SP] CM* [SP ZW];
+LB7.2:      [ZW SP] [SP ZW];
+
+# LB8, ICU differs from UAX-14,
+#    ICU:    ZW ÷;
+#    UAX 14: ZW SP* ÷;
+LB8:        ZW ÷;
+
+# LB8a
+#      ZWJ x (ID | Extended_Pict | EmojiNRK)
+LB8a:       ZWJ (ID | Extended_Pict | EmojiNRK);
+
+
+# LB9:  X CM -> X
+# LB10: Unattached CM -> AL
+
+#LB11:       × WJ;
+#            WJ ×
+
+LB11.1:      [^BK CR LF NL SP ZW] CM* WJ;
+LB11.2:      SP WJ;
+LB11.3:      WJ CM* [^CM];
+
+LB12:        GL CM* [^CM];
+
+LB12a:       [^SP BA HY] CM* GL;
+
+# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
+#
+#   LB13.1   [^SP] CM* [CL CP EX IS SY]    # original UAX 14 rule.
+#   LB13.2   SP    CM* [CL CP EX IS SY]
+
+LB13.1: [^NU SP] CM* [CL CP IS SY];
+LB13.2: [^SP] CM* EX;
+LB13.2: SP [CL CP EX IS SY];
+
+
+# LB 14-17 are moved above LB 7.
+
+LB18:        SP ÷;
+
+LB19:        . CM* QU;
+LB19.1:      QU CM* [^CM];
+
+# LB 20   Break before and after CB.
+#         Interaction with LB8a:  ZWJ x ID is tricky because CM includes ZWJ.
+#                                 ZWJ acts like a CM to the left, combining with CB.
+#                                 ZWJ acts independently to the right, no break from ID by LB8a.
+LB20:        . CM* ÷ CB;
+LB20.1a:     CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB20.1b:      CB CM* ÷;
+
+# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
+#       not picking up the continuing match after the BA from 21a.
+LB21a:       HL CM* (HY | BA) CM* [^CM CB];
+
+LB21.1:      . CM* [BA HY NS];
+LB21.2:      BB CM* [^CM CB];
+
+LB21b:       SY CM* HL;
+
+LB22.1:        (AL | HL | CM) CM* IN;   # The CM is from LB10, treat an unattached CM as AL.
+LB22.2:       EX CM* IN;
+LB22.3:       (ID | EB | EM) CM* IN;
+LB22.4:       IN CM* IN;
+LB22.5:       NU CM* IN;
+
+LB23.1:      (AL | HL | CM) CM* NU;
+LB23.2:      NU CM* (AL | HL);
+
+LB23a.1:     PR CM* (ID | EB | EM);
+LB23a.2:     (ID | EB | EM) CM* PO;
+
+LB24.2:      (PR | PO) CM* (AL | HL);
+LB24.3:      (AL | HL | CM) CM* (PR | PO);
+
+# Numbers. Equivalent to Tailoring example 8 from UAX 14.
+LB25:        ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
+
+LB26.1:      JL CM* (JL | JV | H2 | H3);
+LB26.2:      (JV | H2) CM* (JV | JT);
+LB26.3:      (JT | H3) CM* JT;
+
+LB27.1:      (JL | JV | JT | H2 | H3) CM* IN;
+LB27.2:      (JL | JV | JT | H2 | H3) CM* PO;
+LB27.3:      PR CM* (JL | JV | JT | H2 | H3);
+
+# LB28 Do not break between Alphabetics.
+#      Unattached (leading) CM treated as AL.
+LB28:        (AL | HL | CM)CM* (AL | HL);
+
+LB29:        IS CM* (AL | HL);
+
+# LB30  is adjusted for unattached leading CM being treated as AL.
+LB30.1:      (AL | CM | HL | NU) CM* OP;
+LB30.2:      CP CM* (AL | HL | NU);
+
+# LB30a  keep pairs of RI together.
+LB30a.1:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
+LB30a.2:     RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB30a.3:     RI CM* RI CM* ÷;
+
+# LB30b Do not break between Emoji Base and Emoji Modifier
+LB30b:       EB CM* EM;
+
+# LB31 Break Everywhere Else.
+#      Include combining marks
+LB31.1:        . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB31.2:        . CM* ÷;
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt

new file mode 100644 (file)

index 0000000..a25e9dc
--- /dev/null
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt
@@ -0,0 +1,208 @@
+#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
+#
+#  file:  line_loose.txt
+#
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+#
+# Note: Rule syntax and the monkey test itself are still a work in progress.
+#       They are expected to change with review and the addition of support for rule tailoring.
+#
+#         This tailors the line break behavior to correspond to CSS
+#         line-break=loose (BCP47 -u-lb-loose) as defined for languages other than
+#         Chinese & Japanese.
+#         It sets characters of class CJ to behave like ID.
+#         In addition, it allows breaks:
+#         * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
+#         * between characters of LineBreak class IN
+
+type = line;
+locale = en@lb=loose;
+
+
+AI = [:LineBreak =  Ambiguous:];
+AL = [:LineBreak =  Alphabetic:];
+BA = [:LineBreak =  Break_After:];
+BB = [:LineBreak =  Break_Before:];
+BK = [:LineBreak =  Mandatory_Break:];
+B2 = [:LineBreak =  Break_Both:];
+CB = [:LineBreak =  Contingent_Break:];
+CJ = [:LineBreak =  Conditional_Japanese_Starter:];
+CL = [:LineBreak =  Close_Punctuation:];
+CM = [:LineBreak =  Combining_Mark:];
+CP = [:LineBreak =  Close_Parenthesis:];
+CR = [:LineBreak =  Carriage_Return:];
+EB = [:LineBreak =  EB:];
+EM = [:LineBreak =  EM:];
+EX = [:LineBreak =  Exclamation:];
+GL = [:LineBreak =  Glue:];
+HL = [:LineBreak =  Hebrew_Letter:];
+HY = [:LineBreak =  Hyphen:];
+H2 = [:LineBreak =  H2:];
+H3 = [:LineBreak =  H3:];
+ID = [[:LineBreak =  Ideographic:] CJ];  # CSS Loose tailoring: CJ resolves to ID
+IN = [:LineBreak =  Inseperable:];
+IS = [:LineBreak =  Infix_Numeric:];
+JL = [:LineBreak =  JL:];
+JV = [:LineBreak =  JV:];
+JT = [:LineBreak =  JT:];
+LF = [:LineBreak =  Line_Feed:];
+NL = [:LineBreak =  Next_Line:];
+NSX = [\u3005 \u303B \u309D \u309E \u30FD \u30FE];
+NS = [[:LineBreak =  Nonstarter:] - NSX];
+NU = [:LineBreak =  Numeric:];
+OP = [:LineBreak =  Open_Punctuation:];
+PO = [:LineBreak =  Postfix_Numeric:];
+PR = [:LineBreak =  Prefix_Numeric:];
+QU = [:LineBreak =  Quotation:];
+RI = [:LineBreak =  Regional_Indicator:];
+SA = [:LineBreak =  Complex_Context:];
+SG = [:LineBreak =  Surrogate:];
+SP = [:LineBreak =  Space:];
+SY = [:LineBreak =  Break_Symbols:];
+WJ = [:LineBreak =  Word_Joiner:];
+XX = [:LineBreak =  Unknown:];
+ZW = [:LineBreak =  ZWSpace:];
+ZWJ = [:LineBreak =  ZWJ:];
+
+EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
+Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F9
32\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
+
+# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
+AL = [AL AI SG XX ];
+dictionary = SA;
+
+# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
+#         list it in the numerous rules that use CM.
+CM = [CM ZWJ];
+
+LB4:        BK ÷;
+LB5:        CR LF;
+LB5.1:      CR ÷;
+LB5.2:      LF ÷;
+LB5.3:      NL ÷;
+
+LB6:        . (BK | CR | LF | NL);
+LB6.1:      [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
+
+# Rules LB14 - LB17.
+# Moved before LB7, because they can match a longer sequence that would also match LB7,
+# for example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
+#                           "OP CM SP", matches LB7.1
+LB14:        OP CM* SP* .;
+LB15:        QU CM* SP* OP;
+LB16:        (CL | CP)CM* SP* NS;
+LB17:        B2 CM* SP* B2;
+
+LB7.1:      [^ZW SP] CM* [SP ZW];
+LB7.2:      [ZW SP] [SP ZW];
+
+# LB8, ICU differs from UAX-14,
+#    ICU:    ZW ÷;
+#    UAX 14: ZW SP* ÷;
+LB8:        ZW ÷;
+
+# LB8a
+#      ZWJ x (ID | Extended_Pict | EmojiNRK)
+LB8a:       ZWJ (ID | Extended_Pict | EmojiNRK);
+
+
+# LB9:  X CM -> X
+# LB10: Unattached CM -> AL
+
+#LB11:       × WJ;
+#            WJ ×
+
+LB11.1:      [^BK CR LF NL SP ZW] CM* WJ;
+LB11.2:      SP WJ;
+LB11.3:      WJ CM* [^CM];
+
+LB12:        GL CM* [^CM];
+
+LB12a:       [^SP BA HY] CM* GL;
+
+# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
+#
+#   LB13.1   [^SP] CM* [CL CP EX IS SY]    # original UAX 14 rule.
+#   LB13.2   SP    CM* [CL CP EX IS SY]
+
+LB13.1: [^NU SP] CM* [CL CP IS SY];
+LB13.2: [^SP] CM* EX;
+LB13.2: SP [CL CP EX IS SY];
+
+
+# LB 14-17 are moved above LB 7.
+
+LB18:        SP ÷;
+
+LB19:        . CM* QU;
+LB19.1:      QU CM* [^CM];
+
+# LB 20   Break before and after CB.
+#         Interaction with LB8a:  ZWJ x ID is tricky because CM includes ZWJ.
+#                                 ZWJ acts like a CM to the left, combining with CB.
+#                                 ZWJ acts independently to the right, no break from ID by LB8a.
+LB20:        . CM* ÷ CB;
+LB20.1a:     CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB20.1b:      CB CM* ÷;
+
+# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
+#       not picking up the continuing match after the BA from 21a.
+LB21a:       HL CM* (HY | BA) CM* [^CM CB];
+
+LB21.1:      . CM* [BA HY NS];
+LB21.2:      BB CM* [^CM CB];
+
+LB21b:       SY CM* HL;
+
+LB22.1:        (AL | HL | CM) CM* IN;   # The CM is from LB10, treat an unattached CM as AL.
+LB22.2:       EX CM* IN;
+LB22.3:       (ID | EB | EM) CM* IN;
+# LB22.4:       IN CM* IN;  # delete this rule for CSS loose.
+LB22.5:       NU CM* IN;
+
+LB23.1:      (AL | HL | CM) CM* NU;
+LB23.2:      NU CM* (AL | HL);
+
+LB23a.1:     PR CM* (ID | EB | EM);
+LB23a.2:     (ID | EB | EM) CM* PO;
+
+LB24.2:      (PR | PO) CM* (AL | HL);
+LB24.3:      (AL | HL | CM) CM* (PR | PO);
+
+# Numbers. Equivalent to Tailoring example 8 from UAX 14.
+LB25:        ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
+
+LB26.1:      JL CM* (JL | JV | H2 | H3);
+LB26.2:      (JV | H2) CM* (JV | JT);
+LB26.3:      (JT | H3) CM* JT;
+
+LB27.1:      (JL | JV | JT | H2 | H3) CM* IN;
+LB27.2:      (JL | JV | JT | H2 | H3) CM* PO;
+LB27.3:      PR CM* (JL | JV | JT | H2 | H3);
+
+# LB28 Do not break between Alphabetics.
+#      Unattached (leading) CM treated as AL.
+LB28:        (AL | HL | CM)CM* (AL | HL);
+
+LB29:        IS CM* (AL | HL);
+
+# LB30  is adjusted for unattached leading CM being treated as AL.
+LB30.1:      (AL | CM | HL | NU) CM* OP;
+LB30.2:      CP CM* (AL | HL | NU);
+
+# LB30a  keep pairs of RI together.
+LB30a.1:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
+LB30a.2:     RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB30a.3:     RI CM* RI CM* ÷;
+
+# LB30b Do not break between Emoji Base and Emoji Modifier
+LB30b:       EB CM* EM;
+
+# LB31 Break Everywhere Else.
+#      Include combining marks
+LB31.1:        . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB31.2:        . CM* ÷;
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt

new file mode 100644 (file)

index 0000000..14458cf
--- /dev/null
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt
@@ -0,0 +1,229 @@
+#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
+#
+#  file:  line_loose_cj.txt
+#
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+#
+# Note: Rule syntax and the monkey test itself are still a work in progress.
+#       They are expected to change with review and the addition of support for rule tailoring.
+#
+#         Line Breaking Rules
+#         Implement default line breaking as defined by
+#         Unicode Standard Annex #14 Revision 34 for Unicode 8.0
+#         http://www.unicode.org/reports/tr14/
+#         tailored as noted in 2nd paragraph below.
+#
+#         This tailors the line break behavior to correspond to CSS
+#         line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
+#         It sets characters of class CJ to behave like ID.
+#         In addition, it allows breaks:
+#         * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+#         * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
+#         * between characters of LineBreak class IN such as 2026
+#         * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
+#           FF65 (all NS) and FF01, FF1F (both EX).
+#         * before suffix characters with LineBreak class PO and EastAsianWidth A,F,W;
+#           this includes: 00B0 2030 2032 2033 2035 2103 2109 FE6A FF05 FFE0
+#         * after prefix characters with LineBreak class PR and EastAsianWidth A,F,W;
+#           this includes: 00A4 00B1 20AC 2116 FE69 FF04 FFE1 FFE5 FFE6
+
+
+type = line;
+locale = ja@lb=loose;
+
+
+AI = [:LineBreak =  Ambiguous:];
+AL = [[:LineBreak =  Alphabetic:]];
+BAX = [\u2010 \u2013];
+BA = [[:LineBreak =  Break_After:] - BAX];
+BB = [:LineBreak =  Break_Before:];
+BK = [:LineBreak =  Mandatory_Break:];
+B2 = [:LineBreak =  Break_Both:];
+CB = [:LineBreak =  Contingent_Break:];
+CJ = [:LineBreak =  Conditional_Japanese_Starter:];
+CL = [:LineBreak =  Close_Punctuation:];
+CM = [:LineBreak =  Combining_Mark:];
+CP = [:LineBreak =  Close_Parenthesis:];
+CR = [:LineBreak =  Carriage_Return:];
+EB = [:LineBreak =  EB:];
+EM = [:LineBreak =  EM:];
+EXX = [\uFF01 \uFF1F];
+EX = [[:LineBreak =  Exclamation:] - EXX];
+GL = [:LineBreak =  Glue:];
+HL = [:LineBreak =  Hebrew_Letter:];
+HY = [:LineBreak =  Hyphen:];
+H2 = [:LineBreak =  H2:];
+H3 = [:LineBreak =  H3:];
+ID = [[:LineBreak =  Ideographic:] CJ];  # CSS Loose tailoring: CJ resolves to ID
+IN = [:LineBreak =  Inseperable:];
+IS = [:LineBreak =  Infix_Numeric:];
+JL = [:LineBreak =  JL:];
+JV = [:LineBreak =  JV:];
+JT = [:LineBreak =  JT:];
+LF = [:LineBreak =  Line_Feed:];
+NL = [:LineBreak =  Next_Line:];
+NSX = [\u301C \u30A0 \u3005 \u303B \u309D \u309E \u30FD \u30FE \u203C \u2047 \u2048 \u2049 \u30FB \uFF1A \uFF1B \uFF65];
+NS = [[:LineBreak =  Nonstarter:] - NSX];
+NU = [:LineBreak =  Numeric:];
+OP = [:LineBreak =  Open_Punctuation:];
+POX = [\u00B0 \u2030 \u2032 \u2033 \u2035 \u2103 \u2109 \uFE6A \uFF05 \uFFE0];
+PO = [[:LineBreak =  Postfix_Numeric:] - POX];
+PRX = [\u00A4 \u00B1 \u20AC \u2116 \uFE69 \uFF04 \uFFE1 \uFFE5 \uFFE6];
+PR = [[:LineBreak =  Prefix_Numeric:] - PRX];
+QU = [:LineBreak =  Quotation:];
+RI = [:LineBreak =  Regional_Indicator:];
+SA = [:LineBreak =  Complex_Context:];
+SG = [:LineBreak =  Surrogate:];
+SP = [:LineBreak =  Space:];
+SY = [:LineBreak =  Break_Symbols:];
+WJ = [:LineBreak =  Word_Joiner:];
+XX = [:LineBreak =  Unknown:];
+ZW = [:LineBreak =  ZWSpace:];
+ZWJ = [:LineBreak =  ZWJ:];
+
+EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
+Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F9
32\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
+
+# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
+AL = [AL AI SG XX ];
+dictionary = SA;
+
+# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
+#         list it in the numerous rules that use CM.
+CM = [CM ZWJ];
+
+LB4:        BK ÷;
+LB5:        CR LF;
+LB5.1:      CR ÷;
+LB5.2:      LF ÷;
+LB5.3:      NL ÷;
+
+LB6:        . (BK | CR | LF | NL);
+LB6.1:      [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
+
+# Rules LB14 - LB17.
+# Moved before LB7, because they can match a longer sequence that would also match LB7,
+# for example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
+#                           "OP CM SP", matches LB7.1
+LB14:        OP CM* SP* .;
+LB15:        QU CM* SP* OP;
+LB16:        (CL | CP)CM* SP* NS;
+LB17:        B2 CM* SP* B2;
+
+LB7.1:      [^ZW SP] CM* [SP ZW];
+LB7.2:      [ZW SP] [SP ZW];
+
+# LB8, ICU differs from UAX-14,
+#    ICU:    ZW ÷;
+#    UAX 14: ZW SP* ÷;
+LB8:        ZW ÷;
+
+# LB8a
+#      ZWJ x (ID | Extended_Pict | EmojiNRK)
+LB8a:       ZWJ (ID | Extended_Pict | EmojiNRK);
+
+
+# LB9:  X CM -> X
+# LB10: Unattached CM -> AL
+
+#LB11:       × WJ;
+#            WJ ×
+
+LB11.1:      [^BK CR LF NL SP ZW] CM* WJ;
+LB11.2:      SP WJ;
+LB11.3:      WJ CM* [^CM];
+
+LB12:        GL CM* [^CM];
+
+LB12a:       [^SP BA BAX HY] CM* GL;
+
+# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
+#
+#   LB13.1   [^SP] CM* [CL CP EX IS SY]    # original UAX 14 rule.
+#   LB13.2   SP    CM* [CL CP EX IS SY]
+
+LB13.1: [^NU SP] CM* [CL CP IS SY];
+LB13.2: [^SP] CM* EX;
+LB13.2: SP [CL CP EX IS SY];
+
+
+# LB 14-17 are moved above LB 7.
+
+LB18:        SP ÷;
+
+LB19:        . CM* QU;
+LB19.1:      QU CM* [^CM];
+
+# LB 20   Break before and after CB.
+#         Interaction with LB8a:  ZWJ x ID is tricky because CM includes ZWJ.
+#                                 ZWJ acts like a CM to the left, combining with CB.
+#                                 ZWJ acts independently to the right, no break from ID by LB8a.
+LB20:        . CM* ÷ CB;
+LB20.1a:     CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB20.1b:      CB CM* ÷;
+
+# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
+#       not picking up the continuing match after the BA from 21a.
+# LB 21a Don't break after Hebrew + Hyphen
+#   HL (HY | BA) x
+
+LB21a:       HL CM* (HY | BA | BAX) CM* [^CM CB]?;
+
+LB21.1:      . CM* [BA HY NS];
+LB21.2:      BB CM* [^CM CB];
+
+LB21b:       SY CM* HL;
+
+LB22.1:        (AL | HL | CM) CM* IN;   # The CM is from LB10, treat an unattached CM as AL.
+LB22.2:       EX CM* IN;
+LB22.3:       (ID | EB | EM) CM* IN;
+# LB22.4:       IN CM* IN;  # delete this rule for CSS loose.
+LB22.5:       NU CM* IN;
+
+LB23.1:      (AL | HL | CM) CM* NU;
+LB23.2:      NU CM* (AL | HL);
+
+LB23a.1:     PR CM* (ID | EB | EM);
+LB23a.2:     (ID | EB | EM) CM* PO;
+
+LB24.2:      (PR | PO | POX) CM* (AL | HL);
+LB24.3:      (AL | HL | CM) CM* (PR | PO | POX);
+
+# Numbers. Equivalent to Tailoring example 8 from UAX 14.
+#          Loose_cj tailoring: do not include $PRX at the beginning or $POX at the end.
+LB25:        ((PR | PO | POX)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PRX | PO))?;
+
+LB26.1:      JL CM* (JL | JV | H2 | H3);
+LB26.2:      (JV | H2) CM* (JV | JT);
+LB26.3:      (JT | H3) CM* JT;
+
+LB27.1:      (JL | JV | JT | H2 | H3) CM* IN;
+LB27.2:      (JL | JV | JT | H2 | H3) CM* PO;
+LB27.3:      PR CM* (JL | JV | JT | H2 | H3);
+
+# LB28 Do not break between Alphabetics.
+#      Unattached (leading) CM treated as AL.
+LB28:        (AL | HL | CM)CM* (AL | HL);
+
+LB29:        IS CM* (AL | HL);
+
+# LB30  is adjusted for unattached leading CM being treated as AL.
+LB30.1:      (AL | CM | HL | NU) CM* OP;
+LB30.2:      CP CM* (AL | HL | NU);
+
+# LB30a  keep pairs of RI together.
+LB30a.1:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
+LB30a.2:     RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB30a.3:     RI CM* RI CM* ÷;
+
+# LB30b Do not break between Emoji Base and Emoji Modifier
+LB30b:       EB CM* EM;
+
+# LB31 Break Everywhere Else.
+#      Include combining marks
+LB31.1:        . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB31.2:        . CM* ÷;
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt

new file mode 100644 (file)

index 0000000..a2e0bc5
--- /dev/null
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt
@@ -0,0 +1,214 @@
+#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
+#
+# file: line_normal.txt
+#
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+#
+# Note: Rule syntax and the monkey test itself are still a work in progress.
+#       They are expected to change with review and the addition of support for rule tailoring.
+#
+#         Line Breaking Rules
+#         Implement default line breaking as defined by
+#         Unicode Standard Annex #14 Revision 34 for Unicode 8.0
+#         http://www.unicode.org/reports/tr14/
+#         tailored as noted in 2nd paragraph below.
+#
+#         TODO:  Rule LB 8 remains as it was in Unicode 5.2
+#         This is only because of a limitation of ICU break engine implementation,
+#         not because the older behavior is desirable.
+#
+#         This tailors the line break behavior to correspond to CSS
+#         line-break=normal (BCP47 -u-lb-normal) as defined for languages other than
+#         Chinese & Japanese.
+#         It sets characters of class CJ to behave like ID.
+
+
+type = line;
+locale = en@lb=normal;
+
+AI = [:LineBreak =  Ambiguous:];
+AL = [:LineBreak =  Alphabetic:];
+BA = [:LineBreak =  Break_After:];
+BB = [:LineBreak =  Break_Before:];
+BK = [:LineBreak =  Mandatory_Break:];
+B2 = [:LineBreak =  Break_Both:];
+CB = [:LineBreak =  Contingent_Break:];
+CJ = [:LineBreak =  Conditional_Japanese_Starter:];
+CL = [:LineBreak =  Close_Punctuation:];
+CM = [:LineBreak =  Combining_Mark:];
+CP = [:LineBreak =  Close_Parenthesis:];
+CR = [:LineBreak =  Carriage_Return:];
+EB = [:LineBreak =  EB:];
+EM = [:LineBreak =  EM:];
+EX = [:LineBreak =  Exclamation:];
+GL = [:LineBreak =  Glue:];
+HL = [:LineBreak =  Hebrew_Letter:];
+HY = [:LineBreak =  Hyphen:];
+H2 = [:LineBreak =  H2:];
+H3 = [:LineBreak =  H3:];
+ID = [[:LineBreak =  Ideographic:] CJ];  # CSS Normal tailoring: CJ resolves to ID
+IN = [:LineBreak =  Inseperable:];
+IS = [:LineBreak =  Infix_Numeric:];
+JL = [:LineBreak =  JL:];
+JV = [:LineBreak =  JV:];
+JT = [:LineBreak =  JT:];
+LF = [:LineBreak =  Line_Feed:];
+NL = [:LineBreak =  Next_Line:];
+NS = [:LineBreak =  Nonstarter:];
+NU = [:LineBreak =  Numeric:];
+OP = [:LineBreak =  Open_Punctuation:];
+PO = [:LineBreak =  Postfix_Numeric:];
+PR = [:LineBreak =  Prefix_Numeric:];
+QU = [:LineBreak =  Quotation:];
+RI = [:LineBreak =  Regional_Indicator:];
+SA = [:LineBreak =  Complex_Context:];
+SG = [:LineBreak =  Surrogate:];
+SP = [:LineBreak =  Space:];
+SY = [:LineBreak =  Break_Symbols:];
+WJ = [:LineBreak =  Word_Joiner:];
+XX = [:LineBreak =  Unknown:];
+ZW = [:LineBreak =  ZWSpace:];
+ZWJ = [:LineBreak =  ZWJ:];
+
+EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
+Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F9
32\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
+
+# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
+AL = [AL AI SG XX ];
+dictionary = SA;
+
+# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
+#         list it in the numerous rules that use CM.
+CM = [CM ZWJ];
+
+LB4:        BK ÷;
+LB5:        CR LF;
+LB5.1:      CR ÷;
+LB5.2:      LF ÷;
+LB5.3:      NL ÷;
+
+LB6:        . (BK | CR | LF | NL);
+LB6.1:      [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
+
+# Rules LB14 - LB17.
+# Moved before LB7, because they can match a longer sequence that would also match LB7,
+# for example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
+#                           "OP CM SP", matches LB7.1
+LB14:        OP CM* SP* .;
+LB15:        QU CM* SP* OP;
+LB16:        (CL | CP)CM* SP* NS;
+LB17:        B2 CM* SP* B2;
+
+LB7.1:      [^ZW SP] CM* [SP ZW];
+LB7.2:      [ZW SP] [SP ZW];
+
+# LB8, ICU differs from UAX-14,
+#    ICU:    ZW ÷;
+#    UAX 14: ZW SP* ÷;
+LB8:        ZW ÷;
+
+# LB8a
+#      ZWJ x (ID | Extended_Pict | EmojiNRK)
+LB8a:       ZWJ (ID | Extended_Pict | EmojiNRK);
+
+
+# LB9:  X CM -> X
+# LB10: Unattached CM -> AL
+
+#LB11:       × WJ;
+#            WJ ×
+
+LB11.1:      [^BK CR LF NL SP ZW] CM* WJ;
+LB11.2:      SP WJ;
+LB11.3:      WJ CM* [^CM];
+
+LB12:        GL CM* [^CM];
+
+LB12a:       [^SP BA HY] CM* GL;
+
+# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
+#
+#   LB13.1   [^SP] CM* [CL CP EX IS SY]    # original UAX 14 rule.
+#   LB13.2   SP    CM* [CL CP EX IS SY]
+
+LB13.1: [^NU SP] CM* [CL CP IS SY];
+LB13.2: [^SP] CM* EX;
+LB13.2: SP [CL CP EX IS SY];
+
+
+# LB 14-17 are moved above LB 7.
+
+LB18:        SP ÷;
+
+LB19:        . CM* QU;
+LB19.1:      QU CM* [^CM];
+
+# LB 20   Break before and after CB.
+#         Interaction with LB8a:  ZWJ x ID is tricky because CM includes ZWJ.
+#                                 ZWJ acts like a CM to the left, combining with CB.
+#                                 ZWJ acts independently to the right, no break from ID by LB8a.
+LB20:        . CM* ÷ CB;
+LB20.1a:     CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB20.1b:      CB CM* ÷;
+
+# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
+#       not picking up the continuing match after the BA from 21a.
+LB21a:       HL CM* (HY | BA) CM* [^CM CB];
+
+LB21.1:      . CM* [BA HY NS];
+LB21.2:      BB CM* [^CM CB];
+
+LB21b:       SY CM* HL;
+
+LB22.1:        (AL | HL | CM) CM* IN;   # The CM is from LB10, treat an unattached CM as AL.
+LB22.2:       EX CM* IN;
+LB22.3:       (ID | EB | EM) CM* IN;
+LB22.4:       IN CM* IN;
+LB22.5:       NU CM* IN;
+
+LB23.1:      (AL | HL | CM) CM* NU;
+LB23.2:      NU CM* (AL | HL);
+
+LB23a.1:     PR CM* (ID | EB | EM);
+LB23a.2:     (ID | EB | EM) CM* PO;
+
+LB24.2:      (PR | PO) CM* (AL | HL);
+LB24.3:      (AL | HL | CM) CM* (PR | PO);
+
+# Numbers. Equivalent to Tailoring example 8 from UAX 14.
+LB25:        ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
+
+LB26.1:      JL CM* (JL | JV | H2 | H3);
+LB26.2:      (JV | H2) CM* (JV | JT);
+LB26.3:      (JT | H3) CM* JT;
+
+LB27.1:      (JL | JV | JT | H2 | H3) CM* IN;
+LB27.2:      (JL | JV | JT | H2 | H3) CM* PO;
+LB27.3:      PR CM* (JL | JV | JT | H2 | H3);
+
+# LB28 Do not break between Alphabetics.
+#      Unattached (leading) CM treated as AL.
+LB28:        (AL | HL | CM)CM* (AL | HL);
+
+LB29:        IS CM* (AL | HL);
+
+# LB30  is adjusted for unattached leading CM being treated as AL.
+LB30.1:      (AL | CM | HL | NU) CM* OP;
+LB30.2:      CP CM* (AL | HL | NU);
+
+# LB30a keep pairs of RI together.
+LB30a.1:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
+LB30a.2:     RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB30a.3:     RI CM* RI CM* ÷;
+
+# LB30b Do not break between Emoji Base and Emoji Modifier
+LB30b:       EB CM* EM;
+
+# LB31 Break Everywhere Else.
+#      Include combining marks
+LB31.1:        . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB31.2:        . CM* ÷;
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt

new file mode 100644 (file)

index 0000000..388cd03
--- /dev/null
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt
@@ -0,0 +1,223 @@
+#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (c) 2016  International Business Machines Corporation and others. All Rights Reserved.
+#
+#  file:  line_normal_cj.txt
+#
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+#
+# Note: Rule syntax and the monkey test itself are still a work in progress.
+#       They are expected to change with review and the addition of support for rule tailoring.
+#
+#         Line Breaking Rules
+#         Implement default line breaking as defined by
+#         Unicode Standard Annex #14 Revision 34 for Unicode 8.0
+#         http://www.unicode.org/reports/tr14/
+#         tailored as noted in 2nd paragraph below.
+#
+#         TODO:  Rule LB 8 remains as it was in Unicode 5.2
+#         This is only because of a limitation of ICU break engine implementation,
+#         not because the older behavior is desirable.
+#
+#         This tailors the line break behavior to correspond to CSS
+#         line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
+#         It sets characters of class CJ to behave like ID.
+#         In addition, it allows breaks:
+#         * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+
+type = line;
+locale = ja@lb=normal;
+
+AI = [:LineBreak =  Ambiguous:];
+AL = [:LineBreak =  Alphabetic:];
+BAX = [\u2010 \u2013];
+BA = [[:LineBreak =  Break_After:] - BAX];
+BB = [:LineBreak =  Break_Before:];
+BK = [:LineBreak =  Mandatory_Break:];
+B2 = [:LineBreak =  Break_Both:];
+CB = [:LineBreak =  Contingent_Break:];
+CJ = [:LineBreak =  Conditional_Japanese_Starter:];
+CL = [:LineBreak =  Close_Punctuation:];
+CM = [:LineBreak =  Combining_Mark:];
+CP = [:LineBreak =  Close_Parenthesis:];
+CR = [:LineBreak =  Carriage_Return:];
+EB = [:LineBreak =  EB:];
+EM = [:LineBreak =  EM:];
+EX = [:LineBreak =  Exclamation:];
+GL = [:LineBreak =  Glue:];
+HL = [:LineBreak =  Hebrew_Letter:];
+HY = [:LineBreak =  Hyphen:];
+H2 = [:LineBreak =  H2:];
+H3 = [:LineBreak =  H3:];
+ID = [[:LineBreak =  Ideographic:] CJ];  # CSS Normal tailoring: CJ resolves to ID
+IN = [:LineBreak =  Inseperable:];
+IS = [:LineBreak =  Infix_Numeric:];
+JL = [:LineBreak =  JL:];
+JV = [:LineBreak =  JV:];
+JT = [:LineBreak =  JT:];
+LF = [:LineBreak =  Line_Feed:];
+NL = [:LineBreak =  Next_Line:];
+NSX = [\u301C \u30A0];
+NS = [[:LineBreak =  Nonstarter:] - NSX];
+NU = [:LineBreak =  Numeric:];
+OP = [:LineBreak =  Open_Punctuation:];
+PO = [:LineBreak =  Postfix_Numeric:];
+PR = [:LineBreak =  Prefix_Numeric:];
+QU = [:LineBreak =  Quotation:];
+RI = [:LineBreak =  Regional_Indicator:];
+SA = [:LineBreak =  Complex_Context:];
+SG = [:LineBreak =  Surrogate:];
+SP = [:LineBreak =  Space:];
+SY = [:LineBreak =  Break_Symbols:];
+WJ = [:LineBreak =  Word_Joiner:];
+XX = [:LineBreak =  Unknown:];
+ZW = [:LineBreak =  ZWSpace:];
+ZWJ = [:LineBreak =  ZWJ:];
+
+EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
+Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F9
32\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
+
+# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
+AL = [AL AI SG XX ];
+dictionary = SA;
+
+# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
+#         list it in the numerous rules that use CM.
+CM = [CM ZWJ];
+
+LB4:        BK ÷;
+LB5:        CR LF;
+LB5.1:      CR ÷;
+LB5.2:      LF ÷;
+LB5.3:      NL ÷;
+
+LB6:        . (BK | CR | LF | NL);
+LB6.1:      [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
+
+# Rules LB14 - LB17.
+# Moved before LB7, because they can match a longer sequence that would also match LB7,
+# for example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
+#                           "OP CM SP", matches LB7.1
+LB14:        OP CM* SP* .;
+LB15:        QU CM* SP* OP;
+
+# Do not break between closing punctuation and $NS, even with intervening spaces
+# But DO allow a break between closing punctuation and $NSX, don't include it here
+LB16:        (CL | CP)CM* SP* NS;
+LB17:        B2 CM* SP* B2;
+
+LB7.1:      [^ZW SP] CM* [SP ZW];
+LB7.2:      [ZW SP] [SP ZW];
+
+# LB8, ICU differs from UAX-14,
+#    ICU:    ZW ÷;
+#    UAX 14: ZW SP* ÷;
+LB8:        ZW ÷;
+
+# LB8a
+#      ZWJ x (ID | Extended_Pict | EmojiNRK)
+LB8a:       ZWJ (ID | Extended_Pict | EmojiNRK);
+
+
+# LB9:  X CM -> X
+# LB10: Unattached CM -> AL
+
+#LB11:       × WJ;
+#            WJ ×
+
+LB11.1:      [^BK CR LF NL SP ZW] CM* WJ;
+LB11.2:      SP WJ;
+LB11.3:      WJ CM* [^CM];
+
+LB12:        GL CM* [^CM];
+
+LB12a:       [^SP BA BAX HY] CM* GL;
+
+# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
+#
+#   LB13.1   [^SP] CM* [CL CP EX IS SY]    # original UAX 14 rule.
+#   LB13.2   SP    CM* [CL CP EX IS SY]
+
+LB13.1: [^NU SP] CM* [CL CP IS SY];
+LB13.2: [^SP] CM* EX;
+LB13.2: SP [CL CP EX IS SY];
+
+
+# LB 14-17 are moved above LB 7.
+
+LB18:        SP ÷;
+
+LB19:        . CM* QU;
+LB19.1:      QU CM* [^CM];
+
+# LB 20   Break before and after CB.
+#         Interaction with LB8a:  ZWJ x ID is tricky because CM includes ZWJ.
+#                                 ZWJ acts like a CM to the left, combining with CB.
+#                                 ZWJ acts independently to the right, no break from ID by LB8a.
+LB20:        . CM* ÷ CB;
+LB20.1a:     CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB20.1b:      CB CM* ÷;
+
+# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
+#       not picking up the continuing match after the BA from 21a.
+# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
+#       should "HL BAX" not break when followed by a CB? That's what the current
+#       rules do, which is why "[^CM CB]?" includes the ?.
+LB21a:       HL CM* (HY | BA | BAX) CM* [^CM CB]?;
+
+# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
+LB21.1:      . CM* [BA HY NS];
+LB21.2:      BB CM* [^CM CB];
+
+LB21b:       SY CM* HL;
+
+LB22.1:        (AL | HL | CM) CM* IN;   # The CM is from LB10, treat an unattached CM as AL.
+LB22.2:       EX CM* IN;
+LB22.3:       (ID | EB | EM) CM* IN;
+LB22.4:       IN CM* IN;
+LB22.5:       NU CM* IN;
+
+LB23.1:      (AL | HL | CM) CM* NU;
+LB23.2:      NU CM* (AL | HL);
+
+LB23a.1:     PR CM* (ID | EB | EM);
+LB23a.2:     (ID | EB | EM) CM* PO;
+
+LB24.2:      (PR | PO) CM* (AL | HL);
+LB24.3:      (AL | HL | CM) CM* (PR | PO);
+
+# Numbers. Equivalent to Tailoring example 8 from UAX 14.
+LB25:        ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
+
+LB26.1:      JL CM* (JL | JV | H2 | H3);
+LB26.2:      (JV | H2) CM* (JV | JT);
+LB26.3:      (JT | H3) CM* JT;
+
+LB27.1:      (JL | JV | JT | H2 | H3) CM* IN;
+LB27.2:      (JL | JV | JT | H2 | H3) CM* PO;
+LB27.3:      PR CM* (JL | JV | JT | H2 | H3);
+
+# LB28 Do not break between Alphabetics.
+#      Unattached (leading) CM treated as AL.
+LB28:        (AL | HL | CM)CM* (AL | HL);
+
+LB29:        IS CM* (AL | HL);
+
+# LB30  is adjusted for unattached leading CM being treated as AL.
+LB30.1:      (AL | CM | HL | NU) CM* OP;
+LB30.2:      CP CM* (AL | HL | NU);
+
+# LB30a keep pairs of RI together.
+LB30a.1:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
+LB30a.2:     RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB30a.3:     RI CM* RI CM* ÷;
+
+# LB30b Do not break between Emoji Base and Emoji Modifier
+LB30b:       EB CM* EM;
+
+# LB31 Break Everywhere Else.
+#      Include combining marks
+LB31.1:        . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB31.2:        . CM* ÷;
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/readme.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/readme.txt

new file mode 100644 (file)

index 0000000..33da959
--- /dev/null
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/readme.txt
@@ -0,0 +1,10 @@
+file: main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/readme.txt
+Copyright (C) 2016 and later: Unicode, Inc. and others.
+License & terms of use: http://www.unicode.org/copyright.html#License
+
+Copyright (c) 2015-2016, International Business Machines Corporation and others. All Rights Reserved.
+
+This directory contains the break iterator reference rule files used by the test RBBIMonkeyTest.
+
+The rule files are copied from ICU4C, from source/test/testdata/break_rules/*
+See the readme.txt located there for additional information.
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/sentence.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/sentence.txt

new file mode 100644 (file)

index 0000000..ed0918f
--- /dev/null
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/sentence.txt
@@ -0,0 +1,50 @@
+#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html#License
+
+# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
+# file: sentence.txt
+
+type = sentence;      # one of grapheme | word | line | sentence
+locale = en;
+
+CR        = [\p{Sentence_Break = CR}];
+LF        = [\p{Sentence_Break = LF}];
+Extend    = [\p{Sentence_Break = Extend}];
+Sep       = [\p{Sentence_Break = Sep}];
+Format    = [\p{Sentence_Break = Format}];
+Sp        = [\p{Sentence_Break = Sp}];
+Lower     = [\p{Sentence_Break = Lower}];
+Upper     = [\p{Sentence_Break = Upper}];
+OLetter   = [\p{Sentence_Break = OLetter}];
+Numeric   = [\p{Sentence_Break = Numeric}];
+ATerm     = [\p{Sentence_Break = ATerm}];
+SContinue = [\p{Sentence_Break = SContinue}];
+STerm     = [\p{Sentence_Break = STerm}];
+Close     = [\p{Sentence_Break = Close}];
+
+ParaSep   = [Sep CR LF];
+SATerm    = [STerm ATerm];
+ExtFmt    = [Extend Format];
+
+# SB2:  ÷  eot
+#       Conventional regular expression matching for '$' as end-of-text also matches
+#       at a line separator just preceding the physical end of text.
+#       Instead, use a look-ahead assertion that there is no following character.
+SB2:    . ÷ (?!.);
+
+SB3:    CR LF;
+SB4:    ParaSep ÷;
+
+# SB5: ignore Format and Extend characters.
+
+SB6:    ATerm ExtFmt* Numeric;
+SB7:    (Upper | Lower) ExtFmt* ATerm ExtFmt* Upper;
+SB8:    ATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* ([^OLetter Upper Lower ParaSep SATerm ExtFmt] ExtFmt *)* Lower;
+SB8a:   SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (SContinue | SATerm);
+
+SB9:    SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (CR LF | ParaSep)? ÷;
+        # Also covers SB10, SB11.
+
+SB12:   . ExtFmt* [^ExtFmt]?;
+
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/word.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/word.txt

new file mode 100644 (file)

index 0000000..fd9799c
--- /dev/null
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/word.txt
@@ -0,0 +1,97 @@
+#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
+
+# file: word.txt
+#
+# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest
+#
+# Note: Rule syntax and the monkey test itself are still a work in progress.
+#       They are expected to change with review and the addition of support for rule tailoring.
+
+
+type = word;      # one of grapheme | word | line | sentence
+locale = en;
+
+
+CR                 = [\p{Word_Break = CR}];
+LF                 = [\p{Word_Break = LF}];
+Newline            = [\p{Word_Break = Newline}];
+Extend             = [\p{Word_Break = Extend}];
+ZWJ                = [\p{Word_Break = ZWJ}];
+Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
+Format             = [\p{Word_Break = Format}];
+Katakana           = [\p{Word_Break = Katakana}];
+Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
+ALetter            = [\p{Word_Break = ALetter}];
+Single_Quote       = [\p{Word_Break = Single_Quote}];
+Double_Quote       = [\p{Word_Break = Double_Quote}];
+MidNumLet          = [\p{Word_Break = MidNumLet}];
+MidLetter          = [\p{Word_Break = MidLetter}];
+MidNum             = [\p{Word_Break = MidNum}];
+Numeric            = [\p{Word_Break = Numeric}];
+ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
+E_Base             = [\p{Word_Break = EB}];
+E_Modifier         = [\p{Word_Break = EM}];
+EmojiNRK           = [[\p{Emoji}] - [[Regional_Indicator]\u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
+Extended_Pict      = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0
001F932\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
+EBG                = [\p{Word_Break = EBG}];
+
+# Define dictionary, with the effect being that those characters don't appear in test data.
+
+Han            = [:Han:];
+Hiragana       = [:Hiragana:];
+
+Control        = [\p{Grapheme_Cluster_Break = Control}];
+HangulSyllable = [\uac00-\ud7a3];
+ComplexContext = [:LineBreak = Complex_Context:];
+KanaKanji      = [Han Hiragana Katakana];
+dictionaryCJK  = [KanaKanji HangulSyllable];
+dictionary     = [ComplexContext dictionaryCJK];
+
+# leave dictionary scripts out of ALetter
+
+ALetter        = [ALetter - dictionary];
+
+AHLetter       = [ALetter  Hebrew_Letter];
+MidNumLetQ     = [MidNumLet  Single_Quote];
+ExtFmt         = [Extend Format ZWJ];
+
+WB3:   CR LF;
+WB3a:  (Newline | CR | LF) ÷;
+WB3b:  . ÷ (Newline | CR | LF);   # actually redundant? No other rule combines.
+                                  # (but needed with UAX treat-as scheme.)
+WB3c:   ZWJ (Extended_Pict | EmojiNRK);
+
+WB5:    AHLetter ExtFmt* AHLetter;
+
+# includes both WB6 and WB7
+WB6:    AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt*  AHLetter;
+
+WB7a:   Hebrew_Letter ExtFmt* Single_Quote;
+WB7b:   Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter;   # Include WB7c
+
+WB8:    Numeric ExtFmt* Numeric;
+WB9:    AHLetter ExtFmt* Numeric;
+WB10:   Numeric ExtFmt* AHLetter;
+
+WB11:   Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric;    # includes WB12
+WB13:   Katakana ExtFmt* Katakana;
+
+WB13a:  (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet;
+WB13b:  ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);
+
+# WB rule 15 - 17, pairs of Regional Indicators stay unbroken.
+#              Interacts with WB3c.
+WB15:  Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
+WB17:  Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;
+
+WB14:  (E_Base | EBG) ExtFmt* E_Modifier;
+
+# Rule WB 999   Any ÷ Any
+#    Interacts with WB3c, do not break between ZWJ and (Extended_Pict | EmojiNRK).
+WB999.1: . ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
+WB999.2: . ExtFmt* ÷;
+
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/word_POSIX.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/word_POSIX.txt

new file mode 100644 (file)

index 0000000..6e8be2c
--- /dev/null
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/word_POSIX.txt
@@ -0,0 +1,96 @@
+#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
+
+# file: word_POSIX.txt
+#
+# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest
+#
+# Note: Rule syntax and the monkey test itself are still a work in progress.
+#       They are expected to change with review and the addition of support for rule tailoring.
+
+type = word;      # one of grapheme | word | line | sentence
+locale = en_US_POSIX;
+
+
+CR                 = [\p{Word_Break = CR}];
+LF                 = [\p{Word_Break = LF}];
+Newline            = [\p{Word_Break = Newline}];
+Extend             = [\p{Word_Break = Extend}];
+ZWJ                = [\p{Word_Break = ZWJ}];
+Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
+Format             = [\p{Word_Break = Format}];
+Katakana           = [\p{Word_Break = Katakana}];
+Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
+ALetter            = [\p{Word_Break = ALetter}];
+Single_Quote       = [\p{Word_Break = Single_Quote}];
+Double_Quote       = [\p{Word_Break = Double_Quote}];
+MidNumLet          = [\p{Word_Break = MidNumLet} - [.]];
+MidLetter          = [\p{Word_Break = MidLetter} - [\:]];
+MidNum             = [\p{Word_Break = MidNum} [.]];
+Numeric            = [\p{Word_Break = Numeric}];
+ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
+E_Base             = [\p{Word_Break = EB}];
+E_Modifier         = [\p{Word_Break = EM}];
+EmojiNRK           = [[\p{Emoji}] - [[Regional_Indicator]\u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
+Extended_Pict      = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0
001F932\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
+EBG                = [\p{Word_Break = EBG}];
+
+# Define dictionary, with the effect being that those characters don't appear in test data.
+
+Han            = [:Han:];
+Hiragana       = [:Hiragana:];
+
+Control        = [\p{Grapheme_Cluster_Break = Control}];
+HangulSyllable = [\uac00-\ud7a3];
+ComplexContext = [:LineBreak = Complex_Context:];
+KanaKanji      = [Han Hiragana Katakana];
+dictionaryCJK  = [KanaKanji HangulSyllable];
+dictionary     = [ComplexContext dictionaryCJK];
+
+# leave dictionary scripts out of ALetter
+
+ALetter        = [ALetter - dictionary];
+
+AHLetter       = [ALetter  Hebrew_Letter];
+MidNumLetQ     = [MidNumLet  Single_Quote];
+ExtFmt         = [Extend Format ZWJ];
+
+WB3:   CR LF;
+WB3a:  (Newline | CR | LF) ÷;
+WB3b:  . ÷ (Newline | CR | LF);   # actually redundant? No other rule combines.
+                                  # (but needed with UAX treat-as scheme.)
+WB3c:   ZWJ (Extended_Pict | EmojiNRK);
+
+WB5:    AHLetter ExtFmt* AHLetter;
+
+# includes both WB6 and WB7
+WB6:    AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt*  AHLetter;
+
+WB7a:   Hebrew_Letter ExtFmt* Single_Quote;
+WB7b:   Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter;   # Include WB7c
+
+WB8:    Numeric ExtFmt* Numeric;
+WB9:    AHLetter ExtFmt* Numeric;
+WB10:   Numeric ExtFmt* AHLetter;
+
+WB11:   Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric;    # includes WB12
+WB13:   Katakana ExtFmt* Katakana;
+
+WB13a:  (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet;
+WB13b:  ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);
+
+# WB rule 15 - 17, pairs of Regional Indicators stay unbroken.
+#              Interacts with WB3c.
+WB15:  Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
+WB17:  Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;
+
+WB14:  (E_Base | EBG) ExtFmt* E_Modifier;
+
+# Rule WB 999   Any ÷ Any
+#    Interacts with WB3c, do not break between ZWJ and (Extended_Pict | EmojiNRK).
+WB999.1: . ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
+WB999.2: . ExtFmt* ÷;
+
diff --git a/icu4j/main/tests/framework/src/com/ibm/icu/dev/test/TestFmwk.java b/icu4j/main/tests/framework/src/com/ibm/icu/dev/test/TestFmwk.java

index 0c42e4eeac08db216c3ccb9b9aada9a1b262047a..13f4332ec5ba658a039490c33760601a08e441fb 100644 (file)
--- a/icu4j/main/tests/framework/src/com/ibm/icu/dev/test/TestFmwk.java
+++ b/icu4j/main/tests/framework/src/com/ibm/icu/dev/test/TestFmwk.java
@@ -117,6 +117,38 @@ abstract public class TestFmwk extends AbstractTestLog {
          return new Random(getParams().getSeed());
      }
  
+    /**
+     * Integer Random number generator, produces positive int values.
+     * Similar to C++ std::minstd_rand, with the same algorithm & constants:
+     * next = (last * 48271) % 2147483647.  Provided for compatibility with ICU4C.
+     * Get & set of the seed (the generator's current state) allows for reproducible monkey tests.
+     */
+    protected class ICU_Rand {
+        private int fLast;   // Current state: the seed, or the value most recently returned by next().
+
+        public ICU_Rand(int seed) {   // Delegates to seed(), so the same normalization applies.
+            seed(seed);
+        }
+
+        public int next() {   // Advances the state by one step and returns the new state.
+            fLast = (int)((fLast * 48271L) % 2147483647L);   // long multiplier keeps the product from overflowing int.
+            return fLast;
+        }
+
+        public void seed(int seed) {   // Resets the state; the stored value is always > 0.
+            if (seed <= 0) {   // Zero and negative seeds are replaced by 1.
+                seed = 1;
+            }
+            seed %= 2147483647;   // = 0x7FFFFFFF; the only value this changes is Integer.MAX_VALUE, which becomes 0.
+            fLast = seed > 0 ? seed : 1;   // A 0 left by the modulus above is replaced by 1.
+        }
+
+        public int getSeed() {   // Returns the current state (not the original seed once next() has been called).
+            return fLast;
+        }
+
+    }
+
      static final String ICU_TRAC_URL = "http://bugs.icu-project.org/trac/ticket/";
      static final String CLDR_TRAC_URL = "http://unicode.org/cldr/trac/ticket/";
      static final String CLDR_TICKET_PREFIX = "cldrbug:";
author	Andy Heninger <andy.heninger@gmail.com>
	Tue, 1 Aug 2017 01:03:09 +0000 (01:03 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Tue, 1 Aug 2017 01:03:09 +0000 (01:03 +0000)
icu4c/source/test/intltest/rbbimonkeytest.cpp		patch \| blob \| history
icu4c/source/test/testdata/break_rules/line.txt		patch \| blob \| history
icu4c/source/test/testdata/break_rules/readme.txt		patch \| blob \| history
icu4c/source/test/testdata/break_rules/word.txt		patch \| blob \| history
icu4c/source/test/testdata/break_rules/word_POSIX.txt		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBIMonkeyTest.java	[new file with mode: 0644]	patch \| blob
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/grapheme.txt	[new file with mode: 0644]	patch \| blob
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt	[new file with mode: 0644]	patch \| blob
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt	[new file with mode: 0644]	patch \| blob
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt	[new file with mode: 0644]	patch \| blob
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt	[new file with mode: 0644]	patch \| blob
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt	[new file with mode: 0644]	patch \| blob
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/readme.txt	[new file with mode: 0644]	patch \| blob
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/sentence.txt	[new file with mode: 0644]	patch \| blob
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/word.txt	[new file with mode: 0644]	patch \| blob
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/word_POSIX.txt	[new file with mode: 0644]	patch \| blob
icu4j/main/tests/framework/src/com/ibm/icu/dev/test/TestFmwk.java		patch \| blob \| history