]> granicus.if.org Git - icu/commitdiff
ICU-21127 Error when rbbi got unpaired surrogate char
author Frank Tang <ftang@chromium.org>
Fri, 5 Mar 2021 22:25:53 +0000 (22:25 +0000)
committer Frank Yung-Fong Tang <ftang@google.com>
Fri, 5 Mar 2021 23:45:37 +0000 (15:45 -0800)
See #1520

icu4c/source/common/rbbiscan.cpp
icu4c/source/test/intltest/rbbitst.cpp
icu4c/source/test/intltest/rbbitst.h
icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java

index 10b7e9b68ee798185b5594ee7570cb00e1498050..45911b1cfe0d5cc63f590d6c453fd7705a518b89 100644 (file)
@@ -856,6 +856,10 @@ UChar32  RBBIRuleScanner::nextCharLL() {
         return (UChar32)-1;
     }
     ch         = fRB->fRules.char32At(fNextIndex);
+    if (U_IS_SURROGATE(ch)) {
+        error(U_ILLEGAL_CHAR_FOUND);
+        return U_SENTINEL;
+    }
     fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);
 
     if (ch == chCR ||
index 8e3086b515199214c4607e181dcdeb986110c228..b02478c48bff6a0f948cd33db3d14bc6138874c7 100644 (file)
@@ -134,6 +134,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
     TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
     TESTCASE_AUTO(TestTable_8_16_Bits);
     TESTCASE_AUTO(TestBug13590);
+    TESTCASE_AUTO(TestUnpairedSurrogate);
 
 #if U_ENABLE_TRACING
     TESTCASE_AUTO(TestTraceCreateCharacter);
@@ -5323,4 +5324,43 @@ void RBBITest::TestTraceCreateBreakEngine(void) {
 }
 #endif
 
+void RBBITest::TestUnpairedSurrogate() {
+    UnicodeString rules(u"ab;");
+
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError pe;
+    RuleBasedBreakIterator bi1(rules, pe, status);
+    assertSuccess(WHERE, status);
+    UnicodeString rtRules = bi1.getRules();
+    // make sure the simple one work first.
+    assertEquals(WHERE, rules,  rtRules);
+
+
+    rules = UnicodeString(u"a\\ud800b;").unescape();
+    pe.line = 0;
+    pe.offset = 0;
+    RuleBasedBreakIterator bi2(rules, pe, status);
+    assertEquals(WHERE "unpaired lead surrogate", U_ILLEGAL_CHAR_FOUND , status);
+    if (pe.line != 1 || pe.offset != 1) {
+        errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
+    }
+
+    status = U_ZERO_ERROR;
+    rules = UnicodeString(u"a\\ude00b;").unescape();
+    pe.line = 0;
+    pe.offset = 0;
+    RuleBasedBreakIterator bi3(rules, pe, status);
+    assertEquals(WHERE "unpaired tail surrogate", U_ILLEGAL_CHAR_FOUND , status);
+    if (pe.line != 1 || pe.offset != 1) {
+        errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
+    }
+
+    // make sure the surrogate one work too.
+    status = U_ZERO_ERROR;
+    rules = UnicodeString(u"a😀b;");
+    RuleBasedBreakIterator bi4(rules, pe, status);
+    rtRules = bi4.getRules();
+    assertEquals(WHERE, rules, rtRules);
+}
+
 #endif // #if !UCONFIG_NO_BREAK_ITERATION
index da144114af8003caad606553ac8fe1713bfe8037..754b3e69ea3e81097d478d159f6a333dea066e6a 100644 (file)
@@ -83,6 +83,7 @@ public:
     void TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi);
     void TestBug13692();
     void TestDebugRules();
+    void TestUnpairedSurrogate();
 
     void TestDebug();
     void TestProperties();
index b086e32655de39caa07519b844fbf4e832c8f601..1da270347c8a691fa6d9b7d9bf5d84d13ce41f05 100644 (file)
@@ -85,6 +85,9 @@ class RBBIRuleBuilder {
     //    using these simplified the porting, and consolidated the
     //    creation of Java exceptions
     //
+    static final int U_ILLEGAL_CHAR_FOUND = 12;
+    /**< Character conversion: Illegal input sequence/combination of input units. */
+
     static final int U_BRK_ERROR_START = 0x10200;
     /**< Start of codes indicating Break Iterator failures */
 
index fae2773bac5161a506db5183f3d41914c18b42ca..c9a8aff5a6d9bf4010b8ca78dd8574694bed8d91 100644 (file)
@@ -723,6 +723,9 @@ class RBBIRuleScanner {
             return -1;
         }
         ch = UTF16.charAt(fRB.fRules, fNextIndex);
+        if (Character.isBmpCodePoint(ch) && Character.isSurrogate((char)ch)) {
+            error(RBBIRuleBuilder.U_ILLEGAL_CHAR_FOUND);
+        }
         fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1);
 
         if (ch == '\r' ||
index 268f6c03f44a9771f98a7b673307e9e2601be06a..dae29ad07856b5a3d909751bedfe422964500bb6 100644 (file)
@@ -905,4 +905,41 @@ public class RBBITest extends TestFmwk {
         assertEquals("Wrong number of breaks found", 2, breaksFound);
     }
 
+    /* Test handling of unpair surrogate.
+     */
+    @Test
+    public void TestUnpairedSurrogate() {
+        // make sure the simple one work first.
+        String rules = "ab;";
+        RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rules);
+        assertEquals("Rules does not match", rules, bi.toString());
+
+        try {
+            new RuleBasedBreakIterator("a\ud800b;");
+            fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpair low surrogate.");
+        }
+        catch (IllegalArgumentException e) {
+            // expected exception with unpair surrogate.
+        }
+        catch (Exception e) {
+            fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpair low surrogate: " + e);
+        }
+
+        try {
+            new RuleBasedBreakIterator("a\ude00b;");
+            fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpair high surrogate.");
+        }
+        catch (IllegalArgumentException e) {
+            // expected exception with unpair surrogate.
+        }
+        catch (Exception e) {
+            fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpair high surrogate: " + e);
+        }
+
+
+        // make sure the surrogate one work too.
+        rules = "a😀b;";
+        bi = new RuleBasedBreakIterator(rules);
+        assertEquals("Rules does not match", rules, bi.toString());
+    }
 }