]> granicus.if.org Git - icu/commitdiff
ICU-8576 Dictionary break test updates from George Rhoten
authorPeter Edberg <pedberg@unicode.org>
Wed, 13 Jul 2011 06:49:42 +0000 (06:49 +0000)
committerPeter Edberg <pedberg@unicode.org>
Wed, 13 Jul 2011 06:49:42 +0000 (06:49 +0000)
X-SVN-Rev: 30327

icu4c/source/test/intltest/Makefile.in
icu4c/source/test/intltest/dicttest.cpp [new file with mode: 0644]
icu4c/source/test/intltest/dicttest.h [new file with mode: 0644]
icu4c/source/test/intltest/intltest.vcxproj
icu4c/source/test/intltest/intltest.vcxproj.filters
icu4c/source/test/intltest/itrbbi.cpp
icu4c/source/test/intltest/rbbitst.cpp
icu4c/source/test/intltest/rbbitst.h
icu4c/source/test/testdata/wordsegments.txt [new file with mode: 0644]

index e603509682e7b813badfa3808181cb0b0f3f24b1..fb1e67a02bfd51badeb6ac8da2fc84b69cfb5084 100644 (file)
@@ -51,7 +51,7 @@ tfsmalls.o tmsgfmt.o trcoll.o tscoll.o tsdate.o tsdcfmsy.o tsdtfmsy.o \
 tsmthred.o tsnmfmt.o tsputil.o tstnrapi.o tstnorm.o tzbdtest.o         \
 tzregts.o tztest.o ucdtest.o usettest.o ustrtest.o strcase.o transtst.o strtest.o thcoll.o \
 bytestrietest.o ucharstrietest.o \
-itrbbi.o rbbiapts.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \
+itrbbi.o rbbiapts.o dicttest.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \
 testutil.o transrt.o trnserr.o normconf.o sfwdchit.o \
 jamotest.o srchtest.o reptest.o regextst.o \
 itrbnf.o itrbnfrt.o itrbnfp.o ucaconf.o icusvtst.o \
diff --git a/icu4c/source/test/intltest/dicttest.cpp b/icu4c/source/test/intltest/dicttest.cpp
new file mode 100644 (file)
index 0000000..646084a
--- /dev/null
@@ -0,0 +1,140 @@
+/*
+**********************************************************************
+* Copyright (C) 2011-2011, International Business Machines Corporation 
+* and others.  All Rights Reserved.
+**********************************************************************
+************************************************************************
+*   Date          Name        Description
+*   05/14/2011    grhoten     Creation.
+************************************************************************/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include "dicttest.h"
+#include "textfile.h"
+#include "uvector.h"
+#include "unicode/rbbi.h"
+
+/**
+ * Checks the word and line break iterators created for the "th" locale
+ * against hard-coded boundary offsets for a short Thai sample string.
+ *
+ * Only boundaries strictly before the end of the text are compared: the loop
+ * condition (p < size) drops the final boundary at the text length.
+ * A failure to create either iterator is reported through errcheckln() and
+ * ends the test early.
+ */
+void DictionaryWordTest::TestThaiBreaks() {
+    UErrorCode status=U_ZERO_ERROR;
+    BreakIterator* b;
+    Locale locale = Locale("th");
+    int32_t p, index;
+    UChar c[]= { 
+            0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B, 
+            0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19, 
+            0x0E16, 0x0E49, 0x0E33, 0x0000
+    };
+    int32_t expectedWordResult[] = {
+            2, 3, 6, 10, 11, 15, 17, 20, 22
+    };
+    int32_t expectedLineResult[] = {
+            3, 6, 11, 15, 17, 20, 22
+    };
+    // Element counts, used to keep 'index' inside the expectation tables.
+    const int32_t expectedWordCount =
+        (int32_t)(sizeof(expectedWordResult)/sizeof(expectedWordResult[0]));
+    const int32_t expectedLineCount =
+        (int32_t)(sizeof(expectedLineResult)/sizeof(expectedLineResult[0]));
+
+    int32_t size = u_strlen(c);
+    UnicodeString text=UnicodeString(c);
+
+    b = BreakIterator::createWordInstance(locale, status);
+    if (U_FAILURE(status)) {
+        errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
+        return;
+    }
+    b->setText(text);
+    p = index = 0;
+    while ((p=b->next())!=BreakIterator::DONE && p < size) {
+        if (index >= expectedWordCount) {
+            // More breaks than expectations: report instead of reading past the array.
+            errln("Unexpected extra break given by thai word break iterator. Got: %d", p);
+            break;
+        }
+        if (p != expectedWordResult[index]) {
+            errln("Incorrect break given by thai word break iterator. Expected: %d  Got: %d", expectedWordResult[index], p);
+        }
+        index++;
+    }
+    if (index < expectedWordCount) {
+        errln("Too few breaks given by thai word break iterator. Expected: %d  Got: %d", expectedWordCount, index);
+    }
+    delete b;
+
+    b = BreakIterator::createLineInstance(locale, status);
+    if (U_FAILURE(status)) {
+        // Report through the test framework, the same way as the word iterator above.
+        errcheckln(status, "Unable to create thai line break iterator. - %s", u_errorName(status));
+        return;
+    }
+    b->setText(text);
+    p = index = 0;
+    while ((p=b->next())!=BreakIterator::DONE && p < size) {
+        if (index >= expectedLineCount) {
+            // More breaks than expectations: report instead of reading past the array.
+            errln("Unexpected extra break given by thai line break iterator. Got: %d", p);
+            break;
+        }
+        if (p != expectedLineResult[index]) {
+            errln("Incorrect break given by thai line break iterator. Expected: %d  Got: %d", expectedLineResult[index], p);
+        }
+        index++;
+    }
+    if (index < expectedLineCount) {
+        errln("Too few breaks given by thai line break iterator. Expected: %d  Got: %d", expectedLineCount, index);
+    }
+
+    delete b;
+}
+
+// Name of the word-segmentation data file that TestWordBoundaries opens through TextFile.
+#define DICTIONARY_TEST_FILE "wordsegments.txt"
+
+/**
+ * Data-driven word boundary test.
+ *
+ * Each non-comment line of DICTIONARY_TEST_FILE is a phrase in which the
+ * expected word boundaries are marked with '|' (U+007C). The markers are
+ * stripped, their offsets in the stripped text are recorded (plus the end of
+ * the text), and the word break iterator is then walked forward and backward
+ * over the stripped phrase, comparing every boundary with the recorded ones.
+ */
+void DictionaryWordTest::TestWordBoundaries() {
+    UErrorCode      status  = U_ZERO_ERROR;
+
+    TextFile phrases(DICTIONARY_TEST_FILE, "UTF8", status);
+    if (U_FAILURE(status)) {
+        // Whitespace around the macro is required: C++11 would otherwise parse
+        // the identifier as a user-defined-literal suffix.
+        dataerrln("Can't open " DICTIONARY_TEST_FILE ": %s; skipping test",
+              u_errorName(status));
+        return;
+    }
+
+    // Due to how the word break iterator works,
+    // scripts for languages that use no spaces should use the correct dictionary by default.
+    BreakIterator *wb = BreakIterator::createWordInstance("en", status);
+    if (U_FAILURE(status)) {
+        dataerrln("Word break iterator can not be opened: %s; skipping test",
+              u_errorName(status));
+        return;
+    }
+
+    int32_t pos, pIdx;
+    int32_t testLines = 0;
+    UnicodeString phrase;
+    while (phrases.readLineSkippingComments(phrase, status, FALSE) && U_SUCCESS(status)) {
+        UVector breaks(status);
+
+        pIdx = 0;
+        while (pIdx < phrase.length()) {
+            if (phrase.charAt(pIdx) == 0x007C /* | */) {
+                breaks.addElement(pIdx, status);
+                phrase.remove(pIdx, 1);
+                // Do not advance: the following character has moved into
+                // pIdx and still has to be examined.
+            } else {
+                pIdx++;
+            }
+        }
+        // pIdx is now the length of the stripped phrase, i.e. the final boundary.
+        breaks.addElement(pIdx, status);
+
+        wb->setText(phrase);
+        int32_t brkArrPos = 0;
+        while ((pos=wb->next())!=BreakIterator::DONE) {
+            int32_t expectedPos = breaks.elementAti(brkArrPos);
+            if (expectedPos != pos) {
+                errln("Incorrect forward word break on line %d. Expected: %d  Got: %d",
+                    phrases.getLineNumber(), expectedPos, pos);
+            }
+            brkArrPos++;
+        }
+        // Start from the entry for the end of the text and walk back.
+        brkArrPos = breaks.size() - 1;
+        while ((pos=wb->previous())!=BreakIterator::DONE) {
+            brkArrPos--;
+            // 'breaks' has no entry for the start of the text, so once the
+            // index runs off the front the expected boundary is offset 0.
+            int32_t expectedPos = (brkArrPos >= 0) ? breaks.elementAti(brkArrPos) : 0;
+            if (expectedPos != pos) {
+                errln("Incorrect backward word break on line %d. Expected: %d  Got: %d",
+                    phrases.getLineNumber(), expectedPos, pos);
+            }
+        }
+        testLines++;
+    }
+    delete wb;
+    logln("%d tests were run.", testLines);
+}
+
+/**
+ * Test dispatcher for DictionaryWordTest.
+ *
+ * Logs the suite banner when exec is set, then hands index/exec/name to the
+ * TESTCASE_AUTO macros, which list TestThaiBreaks and TestWordBoundaries.
+ * The par argument is unused.
+ */
+void DictionaryWordTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par */)
+{
+    if (exec) {
+        logln("TestSuite DictionaryWordTest: ");
+    }
+
+    TESTCASE_AUTO_BEGIN;
+    TESTCASE_AUTO(TestThaiBreaks);
+    TESTCASE_AUTO(TestWordBoundaries);
+    TESTCASE_AUTO_END;
+}
+
+
+#endif
diff --git a/icu4c/source/test/intltest/dicttest.h b/icu4c/source/test/intltest/dicttest.h
new file mode 100644 (file)
index 0000000..ffce470
--- /dev/null
@@ -0,0 +1,31 @@
+/*
+**********************************************************************
+* Copyright (C) 2011-2011, International Business Machines Corporation 
+* and others.  All Rights Reserved.
+**********************************************************************
+************************************************************************
+*   Date          Name        Description
+*   05/14/2011    grhoten     Creation.
+************************************************************************/
+
+#ifndef DICTTEST_H
+#define DICTTEST_H
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include "intltest.h"
+
+
+/**
+ * Break iteration tests for dictionary-based word segmentation.
+ */
+class DictionaryWordTest: public IntlTest {
+public:
+    /** Compares Thai word and line breaks with hard-coded offsets. */
+    void TestThaiBreaks();
+
+    /** Checks word boundaries against the '|'-marked phrases in the test data file. */
+    void TestWordBoundaries();
+
+    /** Dispatches the test selected by index; see the definition in dicttest.cpp. */
+    void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
+};
+
+#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
+
+#endif
+
index 52ada6a373e1e68bdef21ad933abd0c94f47958f..613d63564931417d65bb3f0da2a4758d2f000ec0 100644 (file)
   </ItemDefinitionGroup>\r
   <ItemGroup>\r
     <ClCompile Include="bytestrietest.cpp" />\r
+    <ClCompile Include="dicttest.cpp" />\r
     <ClCompile Include="ucharstrietest.cpp" />\r
     <ClCompile Include="itrbbi.cpp" />\r
     <ClCompile Include="rbbiapts.cpp" />\r
     <ClCompile Include="bidiconf.cpp" />\r
   </ItemGroup>\r
   <ItemGroup>\r
+    <ClInclude Include="dicttest.h" />\r
     <ClInclude Include="itrbbi.h" />\r
     <ClInclude Include="rbbiapts.h" />\r
     <ClInclude Include="rbbitst.h" />\r
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />\r
   <ImportGroup Label="ExtensionTargets">\r
   </ImportGroup>\r
-</Project>
\ No newline at end of file
+</Project>\r
index d7b4159b8caa36bc69e6d105039ab4eb50e6a4c5..11738b65b5dbf60171cbd07a19beba7eddd7a0dc 100644 (file)
     <ClCompile Include="alphaindextst.cpp">\r
       <Filter>collation</Filter>\r
     </ClCompile>\r
+    <ClCompile Include="dicttest.cpp">\r
+      <Filter>break iteration</Filter>\r
+    </ClCompile>\r
   </ItemGroup>\r
   <ItemGroup>\r
     <ClInclude Include="itrbbi.h">\r
     <ClInclude Include="alphaindextst.h">\r
       <Filter>collation</Filter>\r
     </ClInclude>\r
+    <ClInclude Include="dicttest.h">\r
+      <Filter>break iteration</Filter>\r
+    </ClInclude>\r
   </ItemGroup>\r
-</Project>
\ No newline at end of file
+</Project>\r
index 75f01cff0d5e9a6d747b0347bc8d3c77a1f0b99a..c6deee06c3b405701f90a18c4d5c3b287988f85e 100644 (file)
@@ -1,6 +1,6 @@
 /*
 **********************************************************************
-* Copyright (C) 1998-2001, International Business Machines Corporation 
+* Copyright (C) 1998-2011, International Business Machines Corporation 
 * and others.  All Rights Reserved.
 **********************************************************************
 */
 #include "itrbbi.h"
 #include "rbbiapts.h"
 #include "rbbitst.h"
+#include "dicttest.h"
+
+// Expands to one 'case n:' of the switch in IntlTestRBBI::runIndexedTest:
+// sets 'name' to the class name and, when 'exec' is set, logs a banner and
+// runs a fresh instance of the class through callTest(). The expansion ends
+// in 'break' without a semicolon; the use site supplies it.
+#define TESTCLASS(n,classname)                   \
+    case n:                                      \
+        name = #classname;                       \
+        if (exec) {                              \
+            logln(#classname "---"); logln("");  \
+            classname test;                      \
+            callTest(test, par);                 \
+        }                                        \
+        break
+
 
 void IntlTestRBBI::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
 {
     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
     switch (index) {
-        case 0:
-            name = "RBBIAPITest"; 
-            if (exec) {
-                logln("RBBIAPITest--"); logln("");
-                RBBIAPITest test;
-                callTest( test, par );
-            }
-            break;
-
-        case 1:
-           name = "RBBITest"; 
-            if (exec) {
-                logln("RBBITest---"); logln("");
-                RBBITest test;
-                callTest( test, par );
-            }
-            break;
+        TESTCLASS(0, RBBIAPITest);
+        TESTCLASS(1, RBBITest);
+        TESTCLASS(2, DictionaryWordTest);
         default: name=""; break;
     }
 }
index aed26287c1369f95abca1241fc683b9c566cb256..7dfeed776aa19ca35d6afb750113e408c44ab324 100644 (file)
@@ -134,17 +134,15 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
 #if !UCONFIG_NO_FILE_IO
         case 21: name = "TestBug5775";
             if (exec) TestBug5775();                           break;
-        case 22: name = "TestThaiBreaks";
-            if (exec) TestThaiBreaks();                        break;
-        case 23: name = "TestTailoredBreaks";
+        case 22: name = "TestTailoredBreaks";
             if (exec) TestTailoredBreaks();                    break;
 #else
-        case 21: case 22: case 23: name = "skip";
+        case 21: case 22: name = "skip";
             break;
 #endif
-        case 24: name = "TestDictRules";
+        case 23: name = "TestDictRules";
             if (exec) TestDictRules();                         break;
-        case 25: name = "TestBug5532";
+        case 24: name = "TestBug5532";
             if (exec) TestBug5532();                           break;
         default: name = ""; break; //needed to end loop
     }
@@ -1810,56 +1808,6 @@ end_test:
 #endif
 }
 
-void RBBITest::TestThaiBreaks() {
-    UErrorCode status=U_ZERO_ERROR;
-    BreakIterator* b;
-    Locale locale = Locale("th");
-    int32_t p, index;
-    UChar c[]= { 
-            0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B, 
-            0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19, 
-            0x0E16, 0x0E49, 0x0E33, 0x0000
-    };
-    int32_t expectedWordResult[] = {
-            2, 3, 6, 10, 11, 15, 17, 20, 22
-    };
-    int32_t expectedLineResult[] = {
-            3, 6, 11, 15, 17, 20, 22
-    };
-
-    int32_t size = u_strlen(c);
-    UnicodeString text=UnicodeString(c);
-    
-    b = BreakIterator::createWordInstance(locale, status);
-    if (U_FAILURE(status)) {
-        errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
-        return;
-    }
-    b->setText(text);
-    p = index = 0;
-    while ((p=b->next())!=BreakIterator::DONE && p < size) {
-        if (p != expectedWordResult[index++]) {
-            errln("Incorrect break given by thai word break iterator. Expected: %d  Got: %d", expectedWordResult[index-1], p);
-        }
-    }
-    delete b;
-    
-    b = BreakIterator::createLineInstance(locale, status);
-    if (U_FAILURE(status)) {
-        printf("Unable to create thai line break iterator.\n");
-        return;
-    }
-    b->setText(text);
-    p = index = 0;
-    while ((p=b->next())!=BreakIterator::DONE && p < size) {
-        if (p != expectedLineResult[index++]) {
-            errln("Incorrect break given by thai line break iterator. Expected: %d  Got: %d", expectedLineResult[index-1], p);
-        }
-    }
-
-    delete b;
-}
-
 // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
 // Words don't include colon or period (cldrbug #1969).
 static const char    posxWordText[]     = "Can't have breaks in xx:yy or struct.field for CS-types.";
index d46c9b59976450433a79f189964a93e20cda0e0c..7effb799e78fd4faede3687a9c7e5f1566ebadbc 100644 (file)
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 1999-2010, International Business Machines
+ * Copyright (c) 1999-2011, International Business Machines
  * Corporation and others. All Rights Reserved.
  *************************************************************************
  *   Date        Name        Description
@@ -68,7 +68,6 @@ public:
     void TestTrieDict();
     void TestUnicodeFiles();
     void TestBug5775();
-    void TestThaiBreaks();
     void TestTailoredBreaks();
     void TestDictRules();
     void TestBug5532();
diff --git a/icu4c/source/test/testdata/wordsegments.txt b/icu4c/source/test/testdata/wordsegments.txt
new file mode 100644 (file)
index 0000000..2dab96d
--- /dev/null
@@ -0,0 +1,23 @@
+# Copyright (C) 2011-2011, International Business Machines Corporation
+# and others. All Rights Reserved.
+#
+#   file name:  wordsegments.txt
+#   encoding:   UTF-8
+#
+#   created on: 2011may14
+#   created by: George Rhoten
+#   created by: Nathan Wells
+#
+# Word boundary test data for languages that contain no spaces.
+# Boundaries are delimited with the | character so that it's easier to debug.
+#
+# If you have test data with zero width spaces to delimit the words, use the following command example.
+# Be sure to copy the zero width space in the sed command.
+# echo 'សូម​ចំណាយពេល​បន្តិច​ដើម្បី​អធិស្ឋាន​អរ​ព្រះគុណ​ដល់​ព្រះអង្គ' | sed 's/​/\|/g'
+#
+
+# Thai
+กู| |กิน|กุ้ง| |ปิ้่|งอ|ยู่|ใน|ถ้ำ
+
+# Khmer
+សូម|ចំណាយពេល|បន្តិច|ដើម្បី|អធិស្ឋាន|អរ|ព្រះគុណ|ដល់|ព្រះអង្គ