]> granicus.if.org Git - icu/commitdiff
ICU-13058 Break iteration tests & rules update for new prop data. Tests are now passi...
authorAndy Heninger <andy.heninger@gmail.com>
Fri, 24 Mar 2017 01:31:00 +0000 (01:31 +0000)
committerAndy Heninger <andy.heninger@gmail.com>
Fri, 24 Mar 2017 01:31:00 +0000 (01:31 +0000)
X-SVN-Rev: 39922

icu4c/source/data/brkitr/rules/char.txt
icu4c/source/data/brkitr/rules/word.txt
icu4c/source/test/intltest/rbbitst.cpp
icu4c/source/test/testdata/break_rules/grapheme.txt
icu4c/source/test/testdata/break_rules/word.txt

index 24c8b5f272e0d4c4e9292faf332b0aa5c1f5f9c0..77572f5cd687e53959b033402879bff4a6ec5a99 100644 (file)
@@ -9,7 +9,8 @@
 #   ICU Character Break Rules, also known as Grapheme Cluster Boundaries
 #      See Unicode Standard Annex #29.
 #      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
-#      plus proposed updates for Emoji 4.0 from https://goo.gl/cluFCn
+#      Plus revisions to rule GB 11 from http://unicode.org/cldr/trac/ticket/10088
+#      Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
 
 #
 #  Character Class Definitions.
@@ -35,7 +36,7 @@ $LVT         = [\p{Grapheme_Cluster_Break = LVT}];
 
 # Emoji defintions
 
-$E_Base      = [[\p{Grapheme_Cluster_Break = EB}] \U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
+$E_Base      = [\p{Grapheme_Cluster_Break = EB}];
 $E_Modifier  = [\p{Grapheme_Cluster_Break = EM}];
 
 # Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
index f2c98e054bcccab7b6fb888b13fac3272f5d80b8..742d8f8fe31e57719cb33b5f609f55025e5ceae9 100644 (file)
@@ -10,6 +10,7 @@
 #      See Unicode Standard Annex #29.
 #      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
 #      with additions for Emoji Sequences from https://goo.gl/cluFCn
+#      Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
 #
 # Note:  Updates to word.txt will usually need to be merged into
 #        word_POSIX.txt also.
@@ -44,7 +45,7 @@ $MidLetter          = [\p{Word_Break = MidLetter}];
 $MidNum             = [\p{Word_Break = MidNum}];
 $Numeric            = [\p{Word_Break = Numeric}];
 $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
-$E_Base             = [\p{Word_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
+$E_Base             = [\p{Word_Break = EB}];
 $E_Modifier         = [\p{Word_Break = EM}];
 
 # Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
index 6bbeeebdef23ad7b622e349f48e48ade9ba2aa9e..f13481a82c3dfa844bfeb4f196df0fa64f9b3de6 100644 (file)
@@ -1683,25 +1683,30 @@ void RBBITest::TestUnicodeFiles() {
 // See ticket #7270.
 
 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
-    static const UChar *badTestCases[] = {                     // Line Numbers from Unicode 7.0.0 file.
-        u"\u200B\u0020}",   // Line 5198
-        u"\u200B\u0020)",   // Line 5202
-        u"\u200B\u0020!",   // Line 5214
-        u"\u200B\u0020,",   // Line 5246
-        u"\u200B\u0020/",   // Line 5298
-        u"\u200B\u0020\u2060"    // Line 5302
+    static struct TestCase {
+        const char *fFileName;
+        const UChar *fString;
+    } badTestCases[] = {                                // Line Numbers from Unicode 7.0.0 file.
+        {"LineBreakTest.txt", u"\u200B\u0020}"},        // Line 5198
+        {"LineBreakTest.txt", u"\u200B\u0020)"},        // Line 5202
+        {"LineBreakTest.txt", u"\u200B\u0020!"},        // Line 5214
+        {"LineBreakTest.txt", u"\u200B\u0020,"},        // Line 5246
+        {"LineBreakTest.txt", u"\u200B\u0020/"},        // Line 5298
+        {"LineBreakTest.txt", u"\u200B\u0020\u2060"},   // Line 5302
+                                                        // Line Numbers from pre-release version of GraphemeBreakTest-10.0.0.txt
+        {"GraphemeBreakTest.txt", u"\u200D\u2640"},     // Line 656, old GB 11 test ZWJ x GAZ
+        {"GraphemeBreakTest.txt", u"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG
+        {"GraphemeBreakTest.txt", u"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier
+
+                                                        // Line Numbers from pre-release version of WordBreakTest-10.0.0.txt
+        {"WordBreakTest.txt", u"\u200D\u261D"},         // Line 1356, ZWJ x EmojiNRK
+        {"WordBreakTest.txt", u"\u200D\U0001F3FB"},     // Line 1358, ZWJ x EmojiNRK
     };
-    if (strcmp(fileName, "LineBreakTest.txt") != 0) {
-        return FALSE;
-    }
 
-#if ((U_PLATFORM == U_PF_OS390) || (U_PLATFORM == U_PF_AIX)) && (U_CPLUSPLUS_VERSION < 11)
     for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
-      const UChar *badCase = badTestCases[n];
-#else
-      for (const UChar *badCase: badTestCases) {
-#endif
-        if (testCase == UnicodeString(badCase)) {
+        const TestCase &badCase = badTestCases[n];
+        if (!strcmp(fileName, badCase.fFileName) &&
+                testCase == UnicodeString(badCase.fString)) {
             return logKnownIssue("7270");
         }
     }
@@ -2043,7 +2048,7 @@ RBBICharMonkey::RBBICharMonkey() {
     fHangulSet->addAll(*fLVSet);
     fHangulSet->addAll(*fLVTSet);
 
-    fEmojiBaseSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
+    fEmojiBaseSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}]"), status);
     fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status);
     fExtendedPictSet  = new UnicodeSet(gExtended_Pict, status);
     fEBGSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status);
@@ -2325,8 +2330,7 @@ RBBIWordMonkey::RBBIWordMonkey()
     fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
     fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}]",       status);
 
-    fEBaseSet         = new UnicodeSet(
-            u"[\\p{Word_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]", status);
+    fEBaseSet         = new UnicodeSet(u"[\\p{Word_Break = EB}]",           status);
     fEBGSet           = new UnicodeSet(u"[\\p{Word_Break = EBG}]",          status);
     fEModifierSet     = new UnicodeSet(u"[\\p{Word_Break = EM}]",           status);
     fZWJSet           = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]",          status);
@@ -4757,32 +4761,21 @@ void RBBITest::TestEmoji() {
                 break;
             }
         }
-        if (testString.length() > 1) {
-            charBreaks->setText(testString);
-            charBreaks->first();
-            int32_t firstBreak = charBreaks->next();
-            if (testString.length() != firstBreak) {
-                if (logKnownIssue("13058", "%s:%d", __FILE__, __LINE__)) {
-                    continue;
+        // Local function check()
+        auto check = [=](const char *breakType, BreakIterator *bi) -> void {
+            if (testString.length() > 1) {
+                bi->setText(testString);
+                bi->first();
+                int32_t firstBreak = bi->next();
+                if (testString.length() != firstBreak) {
+                    errln("%s:%d checking %s. emoji-test.txt:%d Error, uexpected break at offset %d",
+                            __FILE__, __LINE__, breakType, lineNumber, firstBreak);
                 }
-                errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
-                        __FILE__, __LINE__, lineNumber, firstBreak);
-            }
-            wordBreaks->setText(testString);
-            wordBreaks->first();
-            firstBreak = wordBreaks->next();
-            if (testString.length() != firstBreak) {
-                errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
-                        __FILE__, __LINE__, lineNumber, firstBreak);
             }
-            lineBreaks->setText(testString);
-            lineBreaks->first();
-            firstBreak = lineBreaks->next();
-            if (testString.length() != firstBreak) {
-                errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
-                        __FILE__, __LINE__, lineNumber, firstBreak);
-            }
-        }
+        };
+        check("charBreaks", charBreaks.getAlias());
+        check("wordBreaks", wordBreaks.getAlias());
+        check("lineBreaks", lineBreaks.getAlias());
     }
 }
 
index 3e9b9a4f05e3e76fa65370bb9f40d1164f11716e..0b551ba1b3b1d1dd4f7ad708d09bccf6b6406d59 100644 (file)
@@ -36,12 +36,12 @@ LVT                = [\p{Grapheme_Cluster_Break = LVT}];
 # Emoji defintions
 
 EmojiNRK           = [[\p{Emoji}] - [Regional_Indicator\u002a\u00230-9©®™〰〽]];
-E_Base             = [\p{Grapheme_Cluster_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
+E_Base             = [\p{Grapheme_Cluster_Break = EB}];
 E_Modifier         = [\p{Grapheme_Cluster_Break = EM}];
 E_Base_GAZ         = [\p{Grapheme_Cluster_Break = EBG}];
 
-# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
-Extended_Pict         = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
+Extended_Pict         = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-
\U0001F932\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
 
 
 GB3:     CR LF;
index 2f24d6e20e524c971929712158104913a4e79388..783dfc9201aa7902d526bff2c470ad41cb974c0e 100644 (file)
@@ -32,11 +32,11 @@ MidLetter          = [\p{Word_Break = MidLetter}];
 MidNum             = [\p{Word_Break = MidNum}];
 Numeric            = [\p{Word_Break = Numeric}];
 ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
-E_Base             = [\p{Word_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
+E_Base             = [\p{Word_Break = EB}];
 E_Modifier         = [\p{Word_Break = EM}];
 EmojiNRK           = [[\p{Emoji}] - [[Regional_Indicator]\u002a\u00230-9©®™〰〽]];
-# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
-Extended_Pict      = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
+Extended_Pict      = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0
001F932\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
 EBG                = [\p{Word_Break = EBG}];
 
 #define dicitionary, with the effect being that those characters don't appear in test data.