From: Peter Edberg <pedberg@unicode.org>
Date: Wed, 13 Jul 2011 06:49:42 +0000 (+0000)
Subject: ICU-8576 Dictionary break test updates from George Rhoten
X-Git-Tag: milestone-59-0-1~4650
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=62d26cc5fe8cc66696143c44a43d664c2d7be100;p=icu

ICU-8576 Dictionary break test updates from George Rhoten

X-SVN-Rev: 30327
---

diff --git a/icu4c/source/test/intltest/Makefile.in b/icu4c/source/test/intltest/Makefile.in
index e603509682e..fb1e67a02bf 100644
--- a/icu4c/source/test/intltest/Makefile.in
+++ b/icu4c/source/test/intltest/Makefile.in
@@ -51,7 +51,7 @@ tfsmalls.o tmsgfmt.o trcoll.o tscoll.o tsdate.o tsdcfmsy.o tsdtfmsy.o	\
 tsmthred.o tsnmfmt.o tsputil.o tstnrapi.o tstnorm.o tzbdtest.o		\
 tzregts.o tztest.o ucdtest.o usettest.o ustrtest.o strcase.o transtst.o strtest.o thcoll.o \
 bytestrietest.o ucharstrietest.o \
-itrbbi.o rbbiapts.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \
+itrbbi.o rbbiapts.o dicttest.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \
 testutil.o transrt.o trnserr.o normconf.o sfwdchit.o \
 jamotest.o srchtest.o reptest.o regextst.o \
 itrbnf.o itrbnfrt.o itrbnfp.o ucaconf.o icusvtst.o \
diff --git a/icu4c/source/test/intltest/dicttest.cpp b/icu4c/source/test/intltest/dicttest.cpp
new file mode 100644
index 00000000000..646084a20d0
--- /dev/null
+++ b/icu4c/source/test/intltest/dicttest.cpp
@@ -0,0 +1,140 @@
+/*
+**********************************************************************
+* Copyright (C) 2011-2011, International Business Machines Corporation 
+* and others.  All Rights Reserved.
+**********************************************************************
+************************************************************************
+*   Date          Name        Description
+*   05/14/2011    grhoten     Creation.
+************************************************************************/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include "dicttest.h"
+#include "textfile.h"
+#include "uvector.h"
+#include "unicode/rbbi.h"
+
+// Checks the "th" word and line break iterators against fixed break offsets for one Thai string.
+void DictionaryWordTest::TestThaiBreaks() {
+    UErrorCode status=U_ZERO_ERROR;
+    BreakIterator* b;
+    Locale locale = Locale("th");
+    int32_t p, index;
+    UChar c[]= { 
+            0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B, 
+            0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19, 
+            0x0E16, 0x0E49, 0x0E33, 0x0000
+    };
+    // Expected offsets strictly inside the text; the counts keep extra breaks from indexing past the tables.
+    const int32_t expectedWordResult[] = { 2, 3, 6, 10, 11, 15, 17, 20, 22 };
+    const int32_t expectedLineResult[] = { 3, 6, 11, 15, 17, 20, 22 };
+    const int32_t wordCount = (int32_t)(sizeof(expectedWordResult)/sizeof(expectedWordResult[0]));
+    const int32_t lineCount = (int32_t)(sizeof(expectedLineResult)/sizeof(expectedLineResult[0]));
+    int32_t size = u_strlen(c);
+    UnicodeString text=UnicodeString(c);
+
+    b = BreakIterator::createWordInstance(locale, status);
+    if (U_FAILURE(status)) {
+        errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
+        return;
+    }
+    b->setText(text);
+    p = index = 0;
+    while ((p=b->next())!=BreakIterator::DONE && p < size) {
+        int32_t expected = index < wordCount ? expectedWordResult[index++] : (int32_t)BreakIterator::DONE;
+        if (p != expected) {
+            errln("Incorrect break given by thai word break iterator. Expected: %d  Got: %d", expected, p);
+        }
+    }
+    delete b;
+
+    b = BreakIterator::createLineInstance(locale, status);
+    if (U_FAILURE(status)) {
+        errcheckln(status, "Unable to create thai line break iterator. - %s", u_errorName(status));
+        return;
+    }
+    b->setText(text);
+    p = index = 0;
+    while ((p=b->next())!=BreakIterator::DONE && p < size) {
+        int32_t expected = index < lineCount ? expectedLineResult[index++] : (int32_t)BreakIterator::DONE;
+        if (p != expected) {
+            errln("Incorrect break given by thai line break iterator. Expected: %d  Got: %d", expected, p);
+        }
+    }
+    delete b;
+}
+
+#define DICTIONARY_TEST_FILE "wordsegments.txt"
+
+// Reads wordsegments.txt, where '|' marks each expected word boundary, and compares every boundary
+// reported by the "en" word break iterator, forward and then backward, against those offsets.
+void DictionaryWordTest::TestWordBoundaries() {
+    UErrorCode      status  = U_ZERO_ERROR;
+    TextFile phrases(DICTIONARY_TEST_FILE, "UTF8", status);
+    if (U_FAILURE(status)) {
+        // Keep the spaces around the macro: C++11 lexes "..."IDENT as a user-defined-literal suffix.
+        dataerrln("Can't open " DICTIONARY_TEST_FILE ": %s; skipping test", u_errorName(status));
+        return;
+    }
+
+    // Due to how the word break iterator works,
+    // scripts for languages that use no spaces should use the correct dictionary by default.
+    BreakIterator *wb = BreakIterator::createWordInstance("en", status);
+    if (U_FAILURE(status)) {
+        dataerrln("Word break iterator can not be opened: %s; skipping test", u_errorName(status));
+        return;
+    }
+
+    int32_t pos, pIdx;
+    int32_t testLines = 0;
+    UnicodeString phrase;
+    while (phrases.readLineSkippingComments(phrase, status, FALSE) && U_SUCCESS(status)) {
+        UVector breaks(status);
+        // Strip each '|' and record the offset it marked in the stripped phrase.
+        for (pIdx = 0; pIdx < phrase.length(); pIdx++) {
+            if (phrase.charAt(pIdx) == 0x007C /* | */) {
+                breaks.addElement(pIdx, status);
+                phrase.remove(pIdx, 1);
+            }
+        }
+        breaks.addElement(pIdx, status);   // the end of the stripped phrase is the final boundary
+        wb->setText(phrase);
+        int32_t brkArrPos = 0;
+        while ((pos=wb->next())!=BreakIterator::DONE) {
+            int32_t expectedPos = breaks.elementAti(brkArrPos);
+            if (expectedPos != pos) {
+                // Print the int32_t value; elementAt() yields a pointer, which must not be passed for %d.
+                errln("Incorrect forward word break on line %d. Expected: %d  Got: %d",
+                    phrases.getLineNumber(), expectedPos, pos);
+            }
+            brkArrPos++;
+        }
+        brkArrPos = breaks.size() - 1;
+        while ((pos=wb->previous())!=BreakIterator::DONE) {
+            brkArrPos--;
+            int32_t expectedPos = breaks.elementAti(brkArrPos);
+            if (expectedPos != pos) {
+                errln("Incorrect backward word break on line %d. Expected: %d  Got: %d",
+                    phrases.getLineNumber(), expectedPos, pos);
+            }
+        }
+        testLines++;
+    }
+    delete wb;
+    logln("%d tests were run.", testLines);
+}
+
+// Suite dispatcher: logs the suite name when exec is set; the TESTCASE_AUTO macros (defined outside this patch) list the two tests.
+void DictionaryWordTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par */)
+{
+    if (exec) logln("TestSuite DictionaryWordTest: ");
+    TESTCASE_AUTO_BEGIN;
+    TESTCASE_AUTO(TestThaiBreaks);
+    TESTCASE_AUTO(TestWordBoundaries);
+    TESTCASE_AUTO_END;
+}
+
+#endif
diff --git a/icu4c/source/test/intltest/dicttest.h b/icu4c/source/test/intltest/dicttest.h
new file mode 100644
index 00000000000..ffce470d576
--- /dev/null
+++ b/icu4c/source/test/intltest/dicttest.h
@@ -0,0 +1,31 @@
+/*
+**********************************************************************
+* Copyright (C) 2011-2011, International Business Machines Corporation 
+* and others.  All Rights Reserved.
+**********************************************************************
+************************************************************************
+*   Date          Name        Description
+*   05/14/2011    grhoten     Creation.
+************************************************************************/
+
+#ifndef DICTTEST_H
+#define DICTTEST_H
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include "intltest.h"
+
+
+class DictionaryWordTest: public IntlTest {   // intltest suite for the break iterator tests defined in dicttest.cpp
+public:
+    void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );   // suite dispatcher
+    void TestWordBoundaries();   // compares word breaks with the '|'-marked phrases in wordsegments.txt
+    void TestThaiBreaks();       // compares "th" word and line breaks with fixed offsets for one string
+};
+
+#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
+
+#endif
+
diff --git a/icu4c/source/test/intltest/intltest.vcxproj b/icu4c/source/test/intltest/intltest.vcxproj
index 52ada6a373e..613d6356493 100644
--- a/icu4c/source/test/intltest/intltest.vcxproj
+++ b/icu4c/source/test/intltest/intltest.vcxproj
@@ -224,6 +224,7 @@
   </ItemDefinitionGroup>
   <ItemGroup>
     <ClCompile Include="bytestrietest.cpp" />
+    <ClCompile Include="dicttest.cpp" />
     <ClCompile Include="ucharstrietest.cpp" />
     <ClCompile Include="itrbbi.cpp" />
     <ClCompile Include="rbbiapts.cpp" />
@@ -389,6 +390,7 @@
     <ClCompile Include="bidiconf.cpp" />
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="dicttest.h" />
     <ClInclude Include="itrbbi.h" />
     <ClInclude Include="rbbiapts.h" />
     <ClInclude Include="rbbitst.h" />
@@ -533,4 +535,4 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/icu4c/source/test/intltest/intltest.vcxproj.filters b/icu4c/source/test/intltest/intltest.vcxproj.filters
index d7b4159b8ca..11738b65b5d 100644
--- a/icu4c/source/test/intltest/intltest.vcxproj.filters
+++ b/icu4c/source/test/intltest/intltest.vcxproj.filters
@@ -444,6 +444,9 @@
     <ClCompile Include="alphaindextst.cpp">
       <Filter>collation</Filter>
     </ClCompile>
+    <ClCompile Include="dicttest.cpp">
+      <Filter>break iteration</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="itrbbi.h">
@@ -812,5 +815,8 @@
     <ClInclude Include="alphaindextst.h">
       <Filter>collation</Filter>
     </ClInclude>
+    <ClInclude Include="dicttest.h">
+      <Filter>break iteration</Filter>
+    </ClInclude>
   </ItemGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/icu4c/source/test/intltest/itrbbi.cpp b/icu4c/source/test/intltest/itrbbi.cpp
index 75f01cff0d5..c6deee06c3b 100644
--- a/icu4c/source/test/intltest/itrbbi.cpp
+++ b/icu4c/source/test/intltest/itrbbi.cpp
@@ -1,6 +1,6 @@
 /*
 **********************************************************************
-* Copyright (C) 1998-2001, International Business Machines Corporation 
+* Copyright (C) 1998-2011, International Business Machines Corporation 
 * and others.  All Rights Reserved.
 **********************************************************************
 */
@@ -19,28 +19,27 @@
 #include "itrbbi.h"
 #include "rbbiapts.h"
 #include "rbbitst.h"
+#include "dicttest.h"
+
+// TESTCLASS(n, classname): switch case n that sets name to the class name and, when exec is set, logs it and runs a fresh classname instance through callTest().
+#define TESTCLASS(n,classname)        \
+    case n:                           \
+        name = #classname;            \
+        if (exec) {                   \
+            logln(#classname "---");  \
+            logln("");                \
+            classname t;              \
+            callTest(t, par);         \
+        }                             \
+        break
 
 void IntlTestRBBI::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
 {
     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
     switch (index) {
-        case 0:
-            name = "RBBIAPITest"; 
-            if (exec) {
-                logln("RBBIAPITest--"); logln("");
-                RBBIAPITest test;
-                callTest( test, par );
-            }
-            break;
-
-        case 1:
-           name = "RBBITest"; 
-            if (exec) {
-                logln("RBBITest---"); logln("");
-                RBBITest test;
-                callTest( test, par );
-            }
-            break;
+        TESTCLASS(0, RBBIAPITest);
+        TESTCLASS(1, RBBITest);
+        TESTCLASS(2, DictionaryWordTest);
         default: name=""; break;
     }
 }
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp
index aed26287c13..7dfeed776aa 100644
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -134,17 +134,15 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
 #if !UCONFIG_NO_FILE_IO
         case 21: name = "TestBug5775";
             if (exec) TestBug5775();                           break;
-        case 22: name = "TestThaiBreaks";
-            if (exec) TestThaiBreaks();                        break;
-        case 23: name = "TestTailoredBreaks";
+        case 22: name = "TestTailoredBreaks";
             if (exec) TestTailoredBreaks();                    break;
 #else
-        case 21: case 22: case 23: name = "skip";
+        case 21: case 22: name = "skip";
             break;
 #endif
-        case 24: name = "TestDictRules";
+        case 23: name = "TestDictRules";
             if (exec) TestDictRules();                         break;
-        case 25: name = "TestBug5532";
+        case 24: name = "TestBug5532";
             if (exec) TestBug5532();                           break;
         default: name = ""; break; //needed to end loop
     }
@@ -1810,56 +1808,6 @@ end_test:
 #endif
 }
 
-void RBBITest::TestThaiBreaks() {
-    UErrorCode status=U_ZERO_ERROR;
-    BreakIterator* b;
-    Locale locale = Locale("th");
-    int32_t p, index;
-    UChar c[]= { 
-            0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B, 
-            0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19, 
-            0x0E16, 0x0E49, 0x0E33, 0x0000
-    };
-    int32_t expectedWordResult[] = {
-            2, 3, 6, 10, 11, 15, 17, 20, 22
-    };
-    int32_t expectedLineResult[] = {
-            3, 6, 11, 15, 17, 20, 22
-    };
-
-    int32_t size = u_strlen(c);
-    UnicodeString text=UnicodeString(c);
-    
-    b = BreakIterator::createWordInstance(locale, status);
-    if (U_FAILURE(status)) {
-        errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
-        return;
-    }
-    b->setText(text);
-    p = index = 0;
-    while ((p=b->next())!=BreakIterator::DONE && p < size) {
-        if (p != expectedWordResult[index++]) {
-            errln("Incorrect break given by thai word break iterator. Expected: %d  Got: %d", expectedWordResult[index-1], p);
-        }
-    }
-    delete b;
-    
-    b = BreakIterator::createLineInstance(locale, status);
-    if (U_FAILURE(status)) {
-        printf("Unable to create thai line break iterator.\n");
-        return;
-    }
-    b->setText(text);
-    p = index = 0;
-    while ((p=b->next())!=BreakIterator::DONE && p < size) {
-        if (p != expectedLineResult[index++]) {
-            errln("Incorrect break given by thai line break iterator. Expected: %d  Got: %d", expectedLineResult[index-1], p);
-        }
-    }
-
-    delete b;
-}
-
 // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
 // Words don't include colon or period (cldrbug #1969).
 static const char    posxWordText[]     = "Can't have breaks in xx:yy or struct.field for CS-types.";
diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h
index d46c9b59976..7effb799e78 100644
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 1999-2010, International Business Machines
+ * Copyright (c) 1999-2011, International Business Machines
  * Corporation and others. All Rights Reserved.
  *************************************************************************
  *   Date        Name        Description
@@ -68,7 +68,6 @@ public:
     void TestTrieDict();
     void TestUnicodeFiles();
     void TestBug5775();
-    void TestThaiBreaks();
     void TestTailoredBreaks();
     void TestDictRules();
     void TestBug5532();
diff --git a/icu4c/source/test/testdata/wordsegments.txt b/icu4c/source/test/testdata/wordsegments.txt
new file mode 100644
index 00000000000..2dab96da808
--- /dev/null
+++ b/icu4c/source/test/testdata/wordsegments.txt
@@ -0,0 +1,23 @@
+# Copyright (C) 2011-2011, International Business Machines Corporation
+# and others. All Rights Reserved.
+#
+#   file name:  wordsegments.txt
+#   encoding:   UTF-8
+#
+#   created on: 2011may14
+#   created by: George Rhoten
+#   created by: Nathan Wells
+#
+# Word boundary test data for languages that contain no spaces.
+# Boundaries are delimited with the | character so that it's easier to debug.
+#
+# If you have test data with zero width spaces to delimit the words, use the following example command.
+# Be sure to copy the zero width space in the sed command.
+# echo 'áá¼áâáááá¶ááááâááááá·áâáá¾áááá¸âá¢áá·áááá¶áâá¢áâáááááá»áâáááâááááá¢ááá' | sed 's/â/\|/g'
+#
+
+# Thai
+à¸à¸¹| |à¸à¸´à¸|à¸à¸¸à¹à¸| |à¸à¸´à¹à¹|à¸à¸­|à¸¢à¸¹à¹|à¹à¸|à¸à¹à¸³
+
+# Khmer
+áá¼á|áááá¶áááá|ááááá·á|áá¾áááá¸|á¢áá·áááá¶á|á¢á|áááááá»á|ááá|ááááá¢ááá