]> granicus.if.org Git - icu/commitdiff
ICU-21178 Add check for corrupt rbbitst.txt data.
author: Andy Heninger <andy.heninger@gmail.com>
Wed, 8 Jul 2020 00:12:09 +0000 (17:12 -0700)
committer: Andy Heninger <andy.heninger@gmail.com>
Fri, 24 Jul 2020 22:16:12 +0000 (15:16 -0700)
In the test data from rbbitst.txt, two or more adjacent boundary markers with
no intervening test data were accepted, with no indication of a problem.

This situation occurred, as described in bug ICU-21178, with a bad import of
some test cases from CLDR. PR #1194 corrected the problem with the test data
in ICU4C. This PR adds code to flag this situation in the test data, and
also propagates the data fix to ICU4J's copy of rbbitst.txt.

icu4c/source/test/intltest/rbbitst.cpp
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt

index c3d12881d4203a80c52db8751e8f00b39bee9187..8e3086b515199214c4607e181dcdeb986110c228 100644 (file)
@@ -905,6 +905,10 @@ void RBBITest::TestExtended() {
         case PARSE_DATA:
             if (c == u'•') {
                 int32_t  breakIdx = tp.dataToBreak.length();
+                if (tp.expectedBreaks->size() > breakIdx) {
+                    errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
+                          lineNum, column);
+                }
                 tp.expectedBreaks->setSize(breakIdx+1);
                 tp.expectedBreaks->setElementAt(-1, breakIdx);
                 tp.srcLine->setSize(breakIdx+1);
@@ -1069,6 +1073,10 @@ void RBBITest::TestExtended() {
                     tagValue = -1;
                 }
                 int32_t  breakIdx = tp.dataToBreak.length();
+                if (tp.expectedBreaks->size() > breakIdx) {
+                    errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
+                          lineNum, column);
+                }
                 tp.expectedBreaks->setSize(breakIdx+1);
                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
                 tp.srcLine->setSize(breakIdx+1);
index a24d4ee26a32da71e682b9caaae1a20e9ee10380..d35195b49c2c5a81b7e2e4df6874c15c4bff54fb 100644 (file)
@@ -245,6 +245,11 @@ public void TestExtended() {
         case PARSE_DATA:
             if (c == '•') {
                 int  breakIdx = tp.dataToBreak.length();
+                if (tp.expectedBreaks[breakIdx] != 0) {
+                    errln(String.format(
+                            "rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
+                            lineNum, column));
+                }
                 tp.expectedBreaks[breakIdx] = -1;
                 tp.srcLine[breakIdx]        = lineNum;
                 tp.srcCol[breakIdx]         = column;
@@ -388,6 +393,11 @@ public void TestExtended() {
                     tagValue = -1;
                 }
                 int  breakIdx = tp.dataToBreak.length();
+                if (tp.expectedBreaks[breakIdx] != 0) {
+                    errln(String.format(
+                            "rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
+                            lineNum, column));
+                }
                 tp.expectedBreaks[breakIdx] = tagValue;
                 tp.srcLine[breakIdx]        = lineNum;
                 tp.srcCol[breakIdx]         = column;
index 9962f94e40a8deaea9b8fb64839e57e1082d730f..98cf6883d72ba67b3bec23ca9c4c4244d398eae1 100644 (file)
 # वृद्धिसँग ;
 <data>•वृ•द्धि•सँ•ग•</data>
 # अंतःज्ञानी  ;
-<data>•अं•तः•ज्ञा•नी••</data>
+<data>•अं•तः•ज्ञा•नी• •</data>
 # गन्नदी॑धिम ;
 <data>•ग•न्न•दी॑•धि•म•</data>
 # प्प्रप॑द्ये॒ ;
 # भर्तुर्भोगः ;
 <data>•भ•र्तु•र्भो•गः•</data>
 # शॆत्युल  ;
-<data>•शॆ•त्यु•ल••</data>
+<data>•शॆ•त्यु•ल• •</data>
 # महारॆन्य ;
 <data>•म•हा•रॆ•न्य•</data>
 # सॆक्युल ;