ICU-12677 RBBI, fix incorrect stripping of comments from saved rules.

author Andy Heninger <andy.heninger@gmail.com>

Sat, 3 Feb 2018 19:10:50 +0000 (19:10 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Sat, 3 Feb 2018 19:10:50 +0000 (19:10 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Sat, 3 Feb 2018 19:10:50 +0000 (19:10 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Sat, 3 Feb 2018 19:10:50 +0000 (19:10 +0000)
diff --git a/icu4c/source/common/rbbirb.cpp b/icu4c/source/common/rbbirb.cpp

index 72447d88f00aa4c04e8501d5b565198ec5aa70cf..84dfd4b58cfaebe62b38fc400416c669dc346a15 100644 (file)
--- a/icu4c/source/common/rbbirb.cpp
+++ b/icu4c/source/common/rbbirb.cpp
@@ -47,7 +47,7 @@ U_NAMESPACE_BEGIN
  RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
                                         UParseError     *parseErr,
                                         UErrorCode      &status)
- : fRules(rules)
+ : fRules(rules), fStrippedRules(rules)
  {
      fStatus = &status; // status is checked below
      fParseError = parseErr;
@@ -147,8 +147,9 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
          return NULL;
      }
  
-    // Remove comments and whitespace from the rules to make it smaller.
-    UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules));
+    // Remove whitespace from the rules to make it smaller.
+    // The rule parser has already removed comments.
+    fStrippedRules = fScanner->stripRules(fStrippedRules);
  
      // Calculate the size of each section in the data.
      //   Sizes here are padded up to a multiple of 8 for better memory alignment.
@@ -162,7 +163,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
      int32_t safeRevTableSize  = align8(fSafeRevTables->getTableSize());
      int32_t trieSize          = align8(fSetBuilder->getTrieSize());
      int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
-    int32_t rulesSize         = align8((strippedRules.length()+1) * sizeof(UChar));
+    int32_t rulesSize         = align8((fStrippedRules.length()+1) * sizeof(UChar));
  
      (void)safeFwdTableSize;
  
@@ -225,7 +226,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
      data->fStatusTable   = data->fTrie    + trieSize;
      data->fStatusTableLen= statusTableSize;
      data->fRuleSource    = data->fStatusTable + statusTableSize;
-    data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
+    data->fRuleSourceLen = fStrippedRules.length() * sizeof(UChar);
  
      uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
  
@@ -245,7 +246,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
          ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
      }
  
-    strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
+    fStrippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
  
      return data;
  }
diff --git a/icu4c/source/common/rbbirb.h b/icu4c/source/common/rbbirb.h

index f00f58e5dff3fb947f16f41e4e2a53eca4a40b26..0c307577e362b356f40065df882423af5b139e44 100644 (file)
--- a/icu4c/source/common/rbbirb.h
+++ b/icu4c/source/common/rbbirb.h
@@ -130,6 +130,7 @@ public:
      UErrorCode                    *fStatus;          // Error reporting.  Keeping status
      UParseError                   *fParseError;      //   here avoids passing it everywhere.
      const UnicodeString           &fRules;           // The rule string that we are compiling
+    UnicodeString                 fStrippedRules;    // The rule string, with comments stripped.
  
      RBBIRuleScanner               *fScanner;         // The scanner.
      RBBINode                      *fForwardTree;     // The parse trees, generated by the scanner,
diff --git a/icu4c/source/common/rbbiscan.cpp b/icu4c/source/common/rbbiscan.cpp

index db0d7614238b918daf42e60e48de57c7a104981d..49a58068a96f03f6115fb9ecb1c6ea7748019ba8 100644 (file)
--- a/icu4c/source/common/rbbiscan.cpp
+++ b/icu4c/source/common/rbbiscan.cpp
@@ -822,27 +822,24 @@ static const UChar      chRParen    = 0x29;
  
  //------------------------------------------------------------------------------
  //
-//  stripRules    Return a rules string without unnecessary
-//                characters.
+//  stripRules    Return a rules string without extra spaces.
+//                (Comments are removed separately, during rule parsing.)
  //
  //------------------------------------------------------------------------------
  UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
      UnicodeString strippedRules;
-    int rulesLength = rules.length();
-    for (int idx = 0; idx < rulesLength; ) {
-        UChar ch = rules[idx++];
-        if (ch == chPound) {
-            while (idx < rulesLength
-                && ch != chCR && ch != chLF && ch != chNEL)
-            {
-                ch = rules[idx++];
-            }
-        }
-        if (!u_isISOControl(ch)) {
-            strippedRules.append(ch);
+    int32_t rulesLength = rules.length();
+    bool skippingSpaces = false;
+    
+    for (int32_t idx=0; idx<rulesLength; idx = rules.moveIndex32(idx, 1)) {
+        UChar32 cp = rules.char32At(idx);
+        bool whiteSpace = u_hasBinaryProperty(cp, UCHAR_PATTERN_WHITE_SPACE);
+        if (skippingSpaces && whiteSpace) {
+            continue;
          }
+        strippedRules.append(cp);
+        skippingSpaces = whiteSpace;
      }
-    // strippedRules = strippedRules.unescape();
      return strippedRules;
  }
  
@@ -942,6 +939,7 @@ void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
              //  It will be treated as white-space, and serves to break up anything
              //    that might otherwise incorrectly clump together with a comment in
              //    the middle (a variable name, for example.)
+            int32_t commentStart = fScanIndex;
              for (;;) {
                  c.fChar = nextCharLL();
                  if (c.fChar == (UChar32)-1 ||  // EOF
@@ -950,6 +948,9 @@ void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
                      c.fChar == chNEL    ||
                      c.fChar == chLS)       {break;}
              }
+            for (int32_t i=commentStart; i<fNextIndex-1; ++i) {
+                fRB->fStrippedRules.setCharAt(i, u' ');
+            }
          }
          if (c.fChar == (UChar32)-1) {
              return;
diff --git a/icu4c/source/test/intltest/rbbiapts.cpp b/icu4c/source/test/intltest/rbbiapts.cpp

index 1a872907595dd90bfc504f881814063966eb3f5e..fa6ea066807a800caec4b46e7adeaedbae4c4c1c 100644 (file)
--- a/icu4c/source/test/intltest/rbbiapts.cpp
+++ b/icu4c/source/test/intltest/rbbiapts.cpp
@@ -1035,7 +1035,7 @@ void RBBIAPITest::RoundtripRule(const char *dataFile) {
  
      builtRules = (const uint8_t *)udata_getMemory(data.getAlias());
      builtSource = (const UChar *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource);
-    RuleBasedBreakIterator *brkItr = new RuleBasedBreakIterator(builtSource, parseError, status);
+    LocalPointer<RuleBasedBreakIterator> brkItr (new RuleBasedBreakIterator(builtSource, parseError, status));
      if (U_FAILURE(status)) {
          errln("%s:%d createRuleBasedBreakIterator: ICU Error \"%s\"  at line %d, column %d\n",
                  __FILE__, __LINE__, u_errorName(status), parseError.line, parseError.offset);
@@ -1048,7 +1048,6 @@ void RBBIAPITest::RoundtripRule(const char *dataFile) {
          errln("%s:%d Built rules and rebuilt rules are different %s", __FILE__, __LINE__, dataFile);
          return;
      }
-    delete brkItr;
  }
  
  void RBBIAPITest::TestRoundtripRules() {
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp

index 1d9091e5dc0450dfbda5c4a71f2fe7318d1115ea..98b95bb7e2dd00fef18c184c1759f9c930e4fb80 100644 (file)
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -105,6 +105,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
      TESTCASE_AUTO(TestBug12932);
      TESTCASE_AUTO(TestEmoji);
      TESTCASE_AUTO(TestBug12519);
+    TESTCASE_AUTO(TestBug12677);
      TESTCASE_AUTO_END;
  }
  
@@ -4436,6 +4437,23 @@ void RBBITest::TestBug12519() {
      assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
  }
  
+void RBBITest::TestBug12677() {
+    // Check that stripping of comments from rules for getRules() is not confused by
+    // the presence of '#' characters in the rules that do not introduce comments.
+    UnicodeString rules(u"!!forward; \n"
+                         "$x = [ab#];  # a set with a # literal. \n"
+                         " # .;        # a comment that looks sort of like a rule.   \n"
+                         " '#' '?';    # a rule with a quoted #   \n"
+                       );
+    
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError pe;
+    RuleBasedBreakIterator bi (rules, pe, status);
+    assertSuccess(WHERE, status);
+    UnicodeString rtRules = bi.getRules();
+    assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "),  rtRules); 
+}
+
  //
  //  TestDebug    -  A place-holder test for debugging purposes.
  //                  For putting in fragments of other tests that can be invoked
diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h

index 597025c28e33c16a627f534f66ebab243402717d..71febf1cebfe758c491d97c09cf42f15030e3402 100644 (file)
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@@ -74,6 +74,7 @@ public:
      void TestBug12932();
      void TestEmoji();
      void TestBug12519();
+    void TestBug12677();
  
      void TestDebug();
      void TestProperties();
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java

index 6236a308c16175fe267c1bec9511f4fca6bc880b..c3bdaa23b72c54cf0933ae8190b65686b966c1a3 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
@@ -28,6 +28,7 @@ class RBBIRuleBuilder {
  
      String fDebugEnv;              // controls debug trace output
      String fRules;                 // The rule string that we are compiling
+    StringBuilder fStrippedRules;  // The rule string, with comments stripped.
      RBBIRuleScanner fScanner;      // The scanner.
  
  
@@ -142,6 +143,7 @@ class RBBIRuleBuilder {
          fDebugEnv       = ICUDebug.enabled("rbbi") ?
                              ICUDebug.value("rbbi") : null;
          fRules          = rules;
+        fStrippedRules  = new StringBuilder(rules);
          fUSetNodes      = new ArrayList<RBBINode>();
          fRuleStatusVals = new ArrayList<Integer>();
          fScanner        = new RBBIRuleScanner(this);
@@ -165,8 +167,9 @@ class RBBIRuleBuilder {
          DataOutputStream dos = new DataOutputStream(os);
          int i;
  
-        //  Remove comments and whitespace from the rules to make it smaller.
-        String strippedRules = RBBIRuleScanner.stripRules(fRules);
+        //  Remove whitespace from the rules to make it smaller.
+        //  The rule parser has already removed comments.
+        String strippedRules = RBBIRuleScanner.stripRules(fStrippedRules.toString());
  
          // Calculate the size of each section in the data in bytes.
          //   Sizes here are padded up to a multiple of 8 for better memory alignment.
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java

index 1fb6133239994f254e1d6487ed6afbc3addac9fc..afdc927b2f27cee03d2f2dae7187ed8d3082a660 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java
@@ -14,6 +14,7 @@ import java.util.HashMap;
  import com.ibm.icu.impl.Assert;
  import com.ibm.icu.impl.Utility;
  import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
  
  /**
    *  This class is part of the Rule Based Break Iterator rule compiler.
@@ -696,17 +697,16 @@ class RBBIRuleScanner {
      static String stripRules(String rules) {
          StringBuilder strippedRules = new StringBuilder();
          int rulesLength = rules.length();
-        for (int idx = 0; idx < rulesLength;) {
-            char ch = rules.charAt(idx++);
-            if (ch == '#') {
-                while (idx < rulesLength
-                        && ch != '\r' && ch != '\n' && ch != chNEL) {
-                    ch = rules.charAt(idx++);
-                }
-            }
-            if (!UCharacter.isISOControl(ch)) {
-                strippedRules.append(ch);
+        boolean skippingSpaces = false;
+
+        for (int idx = 0; idx < rulesLength; idx = rules.offsetByCodePoints(idx, 1)) {
+            int cp = rules.codePointAt(idx);
+            boolean whiteSpace = UCharacter.hasBinaryProperty(cp, UProperty.PATTERN_WHITE_SPACE);
+            if (skippingSpaces && whiteSpace) {
+                continue;
              }
+            strippedRules.appendCodePoint(cp);
+            skippingSpaces = whiteSpace;
          }
          return strippedRules.toString();
      }
@@ -800,6 +800,7 @@ class RBBIRuleScanner {
                  //  It will be treated as white-space, and serves to break up anything
                  //    that might otherwise incorrectly clump together with a comment in
                  //    the middle (a variable name, for example.)
+                int commentStart = fScanIndex;
                  for (;;) {
                      c.fChar = nextCharLL();
                      if (c.fChar == -1 || // EOF
@@ -811,6 +812,9 @@ class RBBIRuleScanner {
                          break;
                      }
                  }
+                for (int i=commentStart; i<fNextIndex-1; ++i) {
+                    fRB.fStrippedRules.setCharAt(i, ' ');
+                }
              }
              if (c.fChar == -1) {
                  return;
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java

index 1696e34bfff07abb372ff20fe2fbb5337ed7ef86..e7995ed3af740ce04cda542a8ef9f81d770a1f9a 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
@@ -548,4 +548,18 @@ public class RBBITest extends TestFmwk {
          assertEquals("", t1.fExpectedBoundaries, t1.fBoundaries);
          assertEquals("", t2.fExpectedBoundaries, t2.fBoundaries);
      }
+
+    @Test
+    public void TestBug12677() {
+        // Check that stripping of comments from rules for getRules() is not confused by
+        // the presence of '#' characters in the rules that do not introduce comments.
+        String rules = "!!forward; \n"
+                     + "$x = [ab#];  # a set with a # literal. \n"
+                     + " # .;        # a comment that looks sort of like a rule.   \n"
+                     + " '#' '?';    # a rule with a quoted #   \n";
+
+        RuleBasedBreakIterator bi  = new RuleBasedBreakIterator(rules);
+        String rtRules = bi.toString();        // getRules() in C++
+        assertEquals("Break Iterator rule stripping test", "!!forward; $x = [ab#]; '#' '?'; ",  rtRules);
+    }
  }
author	Andy Heninger <andy.heninger@gmail.com>
	Sat, 3 Feb 2018 19:10:50 +0000 (19:10 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Sat, 3 Feb 2018 19:10:50 +0000 (19:10 +0000)
icu4c/source/common/rbbirb.cpp		patch \| blob \| history
icu4c/source/common/rbbirb.h		patch \| blob \| history
icu4c/source/common/rbbiscan.cpp		patch \| blob \| history
icu4c/source/test/intltest/rbbiapts.cpp		patch \| blob \| history
icu4c/source/test/intltest/rbbitst.cpp		patch \| blob \| history
icu4c/source/test/intltest/rbbitst.h		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java		patch \| blob \| history