]> granicus.if.org Git - icu/commitdiff
ICU-11665 Inconsistent word break of Japanese text.
author: Andy Heninger <andy.heninger@gmail.com>
Sat, 16 May 2015 01:05:26 +0000 (01:05 +0000)
committer: Andy Heninger <andy.heninger@gmail.com>
Sat, 16 May 2015 01:05:26 +0000 (01:05 +0000)
X-SVN-Rev: 37448

icu4c/source/data/brkitr/root.txt
icu4c/source/test/cintltst/cbiapts.c

index ea65a8d9241f367d8ac9930a0902fd41e7e6b386..32b9fa8368da1098203fa176e1a61988062cec76 100644 (file)
@@ -21,7 +21,7 @@ root{
     dictionaries{
         Hani:process(dependency){"cjdict.dict"}
         Hira:process(dependency){"cjdict.dict"}
-        Kata:process(dependency){"cjdict.dict"}
+        Kana:process(dependency){"cjdict.dict"}
         Khmr:process(dependency){"khmerdict.dict"}
         Laoo:process(dependency){"laodict.dict"}
         Mymr:process(dependency){"burmesedict.dict"}
index c9080f2d153a828cfe44edce22c4665782b9a0bc..7cea5c6a86b5c630ab112865f4ca1f53a49c2f92 100644 (file)
@@ -1,6 +1,6 @@
 /********************************************************************
  * COPYRIGHT: 
- * Copyright (c) 1997-2013, International Business Machines Corporation and
+ * Copyright (c) 1997-2015, International Business Machines Corporation and
  * others. All Rights Reserved.
  ********************************************************************/
 /********************************************************************************
@@ -31,6 +31,7 @@
 #include "unicode/utext.h"
 #include "cintltst.h"
 #include "cbiapts.h"
+#include "cmemory.h"
 
 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
 log_data_err("Failure at file %s, line %d, error = %s (Are you missing data?)\n", __FILE__, __LINE__, u_errorName(status));}}
@@ -47,6 +48,7 @@ static void TestBreakIteratorStatusVec(void);
 static void TestBreakIteratorUText(void);
 static void TestBreakIteratorTailoring(void);
 static void TestBreakIteratorRefresh(void);
+static void TestBug11665(void);
 
 void addBrkIterAPITest(TestNode** root);
 
@@ -62,6 +64,7 @@ void addBrkIterAPITest(TestNode** root)
     addTest(root, &TestBreakIteratorStatusVec, "tstxtbd/cbiapts/TestBreakIteratorStatusVec");
     addTest(root, &TestBreakIteratorTailoring, "tstxtbd/cbiapts/TestBreakIteratorTailoring");
     addTest(root, &TestBreakIteratorRefresh, "tstxtbd/cbiapts/TestBreakIteratorRefresh");
+    addTest(root, &TestBug11665, "tstxtbd/cbiapts/TestBug11665");
 }
 
 #define CLONETEST_ITERATOR_COUNT 2
@@ -879,4 +882,54 @@ static void TestBreakIteratorRefresh(void) {
     ubrk_close(bi);
 }
 
+
+static void TestBug11665(void) {
+    // Regression test for ICU-11665.
+    //
+    // The problem was with the incorrect breaking of Japanese text beginning
+    // with Katakana characters when no prior Japanese or Chinese text had been
+    // encountered.
+    //
+    // The test walks the word boundaries of the same text twice with a single
+    // break iterator and reports an error if the two passes disagree.
+    //
+    // Tested here in cintltst, rather than in intltest, because only cintltst
+    // tests have the ability to reset ICU, which is needed to get the bug
+    // to manifest itself.
+
+    static UChar japaneseText[] = {0x30A2, 0x30EC, 0x30EB, 0x30AE, 0x30FC, 0x6027, 0x7D50, 0x819C, 0x708E};
+    int32_t boundaries[10] = {0};
+    UBreakIterator *bi = NULL;
+    int32_t brk;
+    int32_t brkIdx = 0;
+    int32_t totalBreaks = 0;
+    UErrorCode status = U_ZERO_ERROR;
+
+    ctest_resetICU();
+    bi = ubrk_open(UBRK_WORD, "en_US", japaneseText, UPRV_LENGTHOF(japaneseText), &status);
+    TEST_ASSERT_SUCCESS(status);
+    if (bi == NULL) {
+        // TEST_ASSERT_SUCCESS only logs the failure; it does not stop the test.
+        // Bail out so that a failed open (e.g. missing data) is not followed
+        // by iteration over an iterator that was never created.
+        return;
+    }
+
+    // First pass: record the boundaries. The cap keeps every store inside
+    // the boundaries[] array.
+    for (brk=ubrk_first(bi); brk != UBRK_DONE; brk=ubrk_next(bi)) {
+        boundaries[brkIdx] = brk;
+        if (++brkIdx >= UPRV_LENGTHOF(boundaries) - 1) {
+            break;
+        }
+    }
+    // NOTE(review): the loop above stops at UPRV_LENGTHOF(boundaries) - 1, so
+    // only the "too few" half of this check can trigger as written.
+    if (brkIdx <= 2 || brkIdx >= UPRV_LENGTHOF(boundaries)) {
+        log_err("%s:%d too few or many breaks found.\n", __FILE__, __LINE__);
+    } else {
+        // Second pass: restart from the beginning and require the same
+        // boundaries, in the same order, as the first pass.
+        totalBreaks = brkIdx;
+        brkIdx = 0;
+        for (brk=ubrk_first(bi); brk != UBRK_DONE; brk=ubrk_next(bi)) {
+            if (brk != boundaries[brkIdx]) {
+                log_err("%s:%d Break #%d differs between first and second iteration.\n", __FILE__, __LINE__, brkIdx);
+                break;
+            }
+            if (++brkIdx >= UPRV_LENGTHOF(boundaries) - 1) {
+                log_err("%s:%d Too many breaks.\n", __FILE__, __LINE__);
+                break;
+            }
+        }
+        if (totalBreaks != brkIdx) {
+            log_err("%s:%d Number of breaks differ between first and second iteration.\n", __FILE__, __LINE__);
+        }
+    }
+    ubrk_close(bi);
+}
+
+
 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */