ICU-13274 rbbi, port updated TestExtended to Java.

author Andy Heninger <andy.heninger@gmail.com>

Thu, 3 Aug 2017 20:31:06 +0000 (20:31 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Thu, 3 Aug 2017 20:31:06 +0000 (20:31 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Thu, 3 Aug 2017 20:31:06 +0000 (20:31 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Thu, 3 Aug 2017 20:31:06 +0000 (20:31 +0000)
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java

index 555450308657fb41c7ff21ed8da23ecaaaa4f230..b77dd77ab331552ecb5bcdcdf7ba629a4eee06d7 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java
@@ -20,7 +20,7 @@ import com.ibm.icu.dev.test.TestFmwk;
  import com.ibm.icu.impl.Utility;
  import com.ibm.icu.lang.UCharacter;
  import com.ibm.icu.text.BreakIterator;
-import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.RuleBasedBreakIterator;
  import com.ibm.icu.util.ULocale;
  
  
@@ -39,7 +39,7 @@ public RBBITestExtended() {
  
  static class TestParams {
      BreakIterator   bi;
-    StringBuffer    dataToBreak    = new StringBuffer();
+    StringBuilder   dataToBreak    = new StringBuilder();
      int[]           expectedBreaks = new int[1000];
      int[]           srcLine        = new int[1000];
      int[]           srcCol         = new int[1000];
@@ -55,7 +55,7 @@ public void TestExtended() {
      //
      //  Open and read the test data file.
      //
-    StringBuffer testFileBuf = new StringBuffer();
+    StringBuilder testFileBuf = new StringBuilder();
      InputStream is = null;
      try {
          is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
@@ -78,7 +78,7 @@ public void TestExtended() {
                      continue;
                  }
  
-                UTF16.append(testFileBuf, c);
+                testFileBuf.appendCodePoint(c);
              }
          } finally {
              isr.close();
@@ -99,20 +99,12 @@ public void TestExtended() {
      final int  PARSE_TAG     = 2;
      final int  PARSE_DATA    = 3;
      final int  PARSE_NUM     = 4;
+    final int  PARSE_RULES   = 5;
  
      int parseState = PARSE_TAG;
  
      int savedState = PARSE_TAG;
  
-    final char CH_LF        = 0x0a;
-    final char CH_CR        = 0x0d;
-    final char CH_HASH      = 0x23;
-    /*static const UChar CH_PERIOD    = 0x2e;*/
-    final char CH_LT        = 0x3c;
-    final char CH_GT        = 0x3e;
-    final char CH_BACKSLASH = 0x5c;
-    final char CH_BULLET    = 0x2022;
-
      int    lineNum  = 1;
      int    colStart = 0;
      int    column   = 0;
@@ -120,17 +112,21 @@ public void TestExtended() {
      int    i;
  
      int    tagValue = 0;       // The numeric value of a <nnn> tag.
+
+    StringBuilder   rules = new StringBuilder();     // Holds rules from a <rules> ... </rules> block
+    int             rulesFirstLine = 0;              // Line number of the start of current <rules> block
+
      int    len = testString.length();
  
      for (charIdx = 0; charIdx < len; ) {
-        int  c = UTF16.charAt(testString, charIdx);
+        int c = testString.codePointAt(charIdx);
          charIdx++;
-        if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
+        if (c == '\r' && charIdx<len && testString.charAt(charIdx) == '\n') {
              // treat CRLF as a unit
-            c = CH_LF;
+            c = '\n';
              charIdx++;
          }
-        if (c == CH_LF || c == CH_CR) {
+        if (c == '\n' || c == '\r') {
              lineNum++;
              colStart = charIdx;
          }
@@ -145,7 +141,7 @@ public void TestExtended() {
  
          case PARSE_TAG:
              {
-            if (c == CH_HASH) {
+            if (c == '#') {
                  parseState = PARSE_COMMENT;
                  savedState = PARSE_TAG;
                  break;
@@ -178,6 +174,15 @@ public void TestExtended() {
                  charIdx += 6;
                  break;
              }
+            if (testString.startsWith("<rules>", charIdx-1) ||
+                    testString.startsWith("<badrules>", charIdx-1)) {
+                charIdx = testString.indexOf('>', charIdx) + 1;
+                parseState = PARSE_RULES;
+                rules.setLength(0);
+                rulesFirstLine = lineNum;
+                break;
+            }
+
              if (testString.startsWith("<locale ", charIdx-1)) {
                  int closeIndex = testString.indexOf(">", charIdx);
                  if (closeIndex < 0) {
@@ -206,8 +211,36 @@ public void TestExtended() {
              //savedState = PARSE_DATA;
              }
  
+        case PARSE_RULES:
+            if (testString.startsWith("</rules>", charIdx-1)) {
+                charIdx += 7;
+                parseState = PARSE_TAG;
+                try {
+                    tp.bi = new RuleBasedBreakIterator(rules.toString());
+                } catch (IllegalArgumentException e) {
+                    errln(String.format("rbbitst.txt:%d  Error creating break iterator from rules.  %s", lineNum, e));
+                }
+            } else if (testString.startsWith("</badrules>", charIdx-1)) {
+                charIdx += 10;
+                parseState = PARSE_TAG;
+                boolean goodRules = true;
+                try {
+                    new RuleBasedBreakIterator(rules.toString());
+                } catch (IllegalArgumentException e) {
+                    goodRules = false;
+                }
+                if (goodRules) {
+                    errln(String.format(
+                            "rbbitst.txt:%d  Expected, but did not get, a failure creating break iterator from rules.",
+                            lineNum));
+                }
+            } else {
+                rules.appendCodePoint(c);
+            }
+            break;
+
          case PARSE_DATA:
-            if (c == CH_BULLET) {
+            if (c == '•') {
                  int  breakIdx = tp.dataToBreak.length();
                  tp.expectedBreaks[breakIdx] = -1;
                  tp.srcLine[breakIdx]        = lineNum;
@@ -247,7 +280,7 @@ public void TestExtended() {
                  } else {
                      // Named code point was recognized.  Insert it
                      //   into the test data.
-                    UTF16.append(tp.dataToBreak, c);
+                    tp.dataToBreak.appendCodePoint(c);
                      for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
                          tp.srcLine[i] = lineNum;
                          tp.srcCol[i]  = column;
@@ -269,28 +302,28 @@ public void TestExtended() {
                  break;
              }
  
-            if (c == CH_LT) {
+            if (c == '<') {
                  tagValue   = 0;
                  parseState = PARSE_NUM;
                  break;
              }
  
-            if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
+            if (c == '#' && column==3) {   // TODO:  why is column off so far?
                  parseState = PARSE_COMMENT;
                  savedState = PARSE_DATA;
                  break;
              }
  
-            if (c == CH_BACKSLASH) {
+            if (c == '\\') {
                  // Check for \ at end of line, a line continuation.
                  //     Advance over (discard) the newline
-                int cp = UTF16.charAt(testString, charIdx);
-                if (cp == CH_CR && charIdx<len && UTF16.charAt(testString, charIdx+1) == CH_LF) {
+                int cp = testString.codePointAt(charIdx);
+                if (cp == '\r' && charIdx<len && testString.codePointAt(charIdx+1) == '\n') {
                      // We have a CR LF
                      //  Need an extra increment of the input ptr to move over both of them
                      charIdx++;
                  }
-                if (cp == CH_LF || cp == CH_CR) {
+                if (cp == '\n' || cp == '\r') {
                      lineNum++;
                      column   = 0;
                      charIdx++;
@@ -306,7 +339,7 @@ public void TestExtended() {
                      // Escape sequence was recognized.  Insert the char
                      //   into the test data.
                      charIdx = charIdxAr[0];
-                    UTF16.append(tp.dataToBreak, cp);
+                    tp.dataToBreak.appendCodePoint(cp);
                      for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
                          tp.srcLine[i] = lineNum;
                          tp.srcCol[i]  = column;
@@ -319,12 +352,12 @@ public void TestExtended() {
                  // Not a recognized backslash escape sequence.
                  // Take the next char as a literal.
                  //  TODO:  Should this be an error?
-                c = UTF16.charAt(testString,charIdx);
-                charIdx = UTF16.moveCodePointOffset(testString, charIdx, 1);
+                c = testString.codePointAt(charIdx);
+                charIdx = testString.offsetByCodePoints(charIdx, 1);
               }
  
              // Normal, non-escaped data char.
-            UTF16.append(tp.dataToBreak, c);
+            tp.dataToBreak.appendCodePoint(c);
  
              // Save the mapping from offset in the data to line/column numbers in
              //   the original input file.  Will be used for better error messages only.
@@ -344,7 +377,7 @@ public void TestExtended() {
                  break;
              }
  
-            if (c == CH_GT) {
+            if (c == '>') {
                  // Finished the number.  Add the info to the expected break data,
                  //   and switch parse state back to doing plain data.
                  parseState = PARSE_DATA;
@@ -363,15 +396,20 @@ public void TestExtended() {
                  break;
              }
  
-            errln("Syntax Error in test file at line "+ lineNum +", col %d" + column);
+            errln(String.format("Syntax Error in rbbitst.txt at line %d, col %d", lineNum, column));
              return;
-
-            // parseState = PARSE_COMMENT;   // TODO: unreachable.  Don't stop on errors.
-            // break;
          }
+    }
  
+    // Reached end of test file. Raise an error if parseState indicates that we are
+    //   within a block that should have been terminated.
  
-
+    if (parseState == PARSE_RULES) {
+        errln(String.format("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
+            lineNum, rulesFirstLine));
+    }
+    if (parseState == PARSE_DATA) {
+        errln(String.format("rbbitst.txt:%d <data> block not closed.", lineNum));
      }
  }
  
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt

index f9e97ab4fb15fb0d078d2e34acc20b4f44fb0567..f07107bdfb03d3808d8c000bdd4bcd7350fc762e 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
@@ -14,7 +14,9 @@
  #   <sent>    any following data is for sentence break testing
  #   <line>    any following data is for line break testing
  #   <char>    any following data is for char break testing
-#   <locale local_name>  Switch to the named locale at the next occurence of <word>, <sent>, etc.
+#   <rules> rules ... </rules>  following data is tested against these rules.
+#                               Applies until a following occurence of <word>, <sent>, etc. or another <rules>
+#   <locale locale_name>  Switch to the named locale at the next occurence of <word>, <sent>, etc.
  #   <data> ... </data>   test data.  May span multiple lines.
  #      <>        Break position, status == 0
  #      •         Break position, status == 0   (Bullet, \u2022)
@@ -37,8 +39,17 @@
  #   Temp debugging tests
  <locale en>
  <word>
-<data><0>ク<400>ライアン<400>ト<400>サーバー<400></data>
-# <data><0>ク<400>ライアン<400>トサーバー<400></data>
+<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
+コンピューター<400>は<400>、<0>文字<400>や<400>記号<400>など<400>の<400>それぞれに<400>番号<400>を<400>割り振る<400>こと<400>によって<400>扱える<400>\
+よう<400>にし<400>ます<400>。<0>ユニ<400>コード<400>が<400>出来る<400>まで<400>は<400>、<0>これらの<400>番号<400>を<400>割り振る<400>仕組み<400>が<400>\
+何<400>百<400>種類<400>も<400>存在<400>しま<400>した<400>。<0>どの<400>一つ<400>を<400>とっても<400>、<0>十分<400>な<400>文字<400>を<400>含<400>\
+んで<400>は<400>いま<400>せん<400>で<400>した<400>。<0>例えば<400>、<0>欧州<400>連合<400>一つ<400>を<400>見<400>て<400>も<400>、<0>その<400>\
+すべて<400>の<400>言語<400>を<400>カバー<400>する<400>ため<400>に<400>は<400>、<0>いくつか<400>の<400>異なる<400>符号<400>化<400>の<400>仕組み<400>\
+が<400>必要<400>で<400>した<400>。<0>英語<400>の<400>よう<400>な<400>一つ<400>の<400>言語<400>に<400>限<400>って<400>も<400>、<0>一つ<400>だけ<400>\
+の<400>符号<400>化<400>の<400>仕組み<400>では<400>、<0>一般<400>的<400>に<400>使<400>われる<400>すべて<400>の<400>文字<400>、<0>句読点<400>、<0>\
+。<0></data>
+
+#<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
  
  ## FILTERED BREAK TESTS
  
@@ -1308,3 +1319,48 @@ Bangkok)•</data>
  <data>•\U0001F468\u200D\u2695\uFE0F•\U0001F468\u200D\u2695•\U0001F468\U0001F3FD\u200D\u2695\uFE0F•\U0001F468\U0001F3FD\u200D\u2695\u0020•</data>
  # woman astronaut, woman astronaut / fitz4
  <data>•\U0001F469\u200D\U0001F680•\U0001F469\U0001F3FD\u200D\U0001F680\u0020•</data>
+
+
+####################################################################################
+#
+#  Test rule status values
+#
+####################################################################################
+<rules> $Letters = [:L:];
+    $Numbers = [:N:];
+    $Letters+{1};
+    $Numbers+{2};
+    Help\ me\!{4};
+    [^$Letters $Numbers];
+    !.*;
+</rules>
+<data>•abc<1>123<2>.•.•abc<1> •Help<1> •me<1> •Help me!<4></data>
+
+# Test option to prohibit unquoted literals.
+
+<rules>
+!!forward;
+    Hello\ World;
+!!reverse;
+    .*;
+</rules>
+<data>•Hello World•</data>
+
+<badrules>
+!!quoted_literals_only;
+!!forward;
+    Hello\ World;
+!!reverse;
+    .*;
+</badrules>
+
+<rules>
+#TODO: uncomment this line when quoted_literals_only is implemented.
+#!!quoted_literals_only;
+!!forward;
+    'Hello World';
+!!reverse;
+    .*;
+</rules>
+<data>•Hello World•</data>
+
author	Andy Heninger <andy.heninger@gmail.com>
	Thu, 3 Aug 2017 20:31:06 +0000 (20:31 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Thu, 3 Aug 2017 20:31:06 +0000 (20:31 +0000)
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt		patch \| blob \| history