ICU-8576 Dictionary break test updates from George Rhoten

author Peter Edberg <pedberg@unicode.org>

Wed, 13 Jul 2011 06:49:42 +0000 (06:49 +0000)

committer Peter Edberg <pedberg@unicode.org>

Wed, 13 Jul 2011 06:49:42 +0000 (06:49 +0000)
author Peter Edberg <pedberg@unicode.org>
Wed, 13 Jul 2011 06:49:42 +0000 (06:49 +0000)
committer Peter Edberg <pedberg@unicode.org>
Wed, 13 Jul 2011 06:49:42 +0000 (06:49 +0000)
diff --git a/icu4c/source/test/intltest/Makefile.in b/icu4c/source/test/intltest/Makefile.in

index e603509682e7b813badfa3808181cb0b0f3f24b1..fb1e67a02bfd51badeb6ac8da2fc84b69cfb5084 100644 (file)
--- a/icu4c/source/test/intltest/Makefile.in
+++ b/icu4c/source/test/intltest/Makefile.in
@@ -51,7 +51,7 @@ tfsmalls.o tmsgfmt.o trcoll.o tscoll.o tsdate.o tsdcfmsy.o tsdtfmsy.o \
  tsmthred.o tsnmfmt.o tsputil.o tstnrapi.o tstnorm.o tzbdtest.o         \
  tzregts.o tztest.o ucdtest.o usettest.o ustrtest.o strcase.o transtst.o strtest.o thcoll.o \
  bytestrietest.o ucharstrietest.o \
-itrbbi.o rbbiapts.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \
+itrbbi.o rbbiapts.o dicttest.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \
  testutil.o transrt.o trnserr.o normconf.o sfwdchit.o \
  jamotest.o srchtest.o reptest.o regextst.o \
  itrbnf.o itrbnfrt.o itrbnfp.o ucaconf.o icusvtst.o \
diff --git a/icu4c/source/test/intltest/dicttest.cpp b/icu4c/source/test/intltest/dicttest.cpp

new file mode 100644 (file)

index 0000000..646084a
--- /dev/null
+++ b/icu4c/source/test/intltest/dicttest.cpp
@@ -0,0 +1,140 @@
+/*
+**********************************************************************
+* Copyright (C) 2011-2011, International Business Machines Corporation 
+* and others.  All Rights Reserved.
+**********************************************************************
+************************************************************************
+*   Date          Name        Description
+*   05/14/2011    grhoten     Creation.
+************************************************************************/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include "dicttest.h"
+#include "textfile.h"
+#include "uvector.h"
+#include "unicode/rbbi.h"
+
+void DictionaryWordTest::TestThaiBreaks() {
+    UErrorCode status=U_ZERO_ERROR;
+    BreakIterator* b;
+    Locale locale = Locale("th");
+    int32_t p, index;
+    UChar c[]= { 
+            0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B, 
+            0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19, 
+            0x0E16, 0x0E49, 0x0E33, 0x0000
+    };
+    int32_t expectedWordResult[] = {
+            2, 3, 6, 10, 11, 15, 17, 20, 22
+    };
+    int32_t expectedLineResult[] = {
+            3, 6, 11, 15, 17, 20, 22
+    };
+
+    int32_t size = u_strlen(c);
+    UnicodeString text=UnicodeString(c);
+    
+    b = BreakIterator::createWordInstance(locale, status);
+    if (U_FAILURE(status)) {
+        errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
+        return;
+    }
+    b->setText(text);
+    p = index = 0;
+    while ((p=b->next())!=BreakIterator::DONE && p < size) {
+        if (p != expectedWordResult[index++]) {
+            errln("Incorrect break given by thai word break iterator. Expected: %d  Got: %d", expectedWordResult[index-1], p);
+        }
+    }
+    delete b;
+    
+    b = BreakIterator::createLineInstance(locale, status);
+    if (U_FAILURE(status)) {
+        printf("Unable to create thai line break iterator.\n");
+        return;
+    }
+    b->setText(text);
+    p = index = 0;
+    while ((p=b->next())!=BreakIterator::DONE && p < size) {
+        if (p != expectedLineResult[index++]) {
+            errln("Incorrect break given by thai line break iterator. Expected: %d  Got: %d", expectedLineResult[index-1], p);
+        }
+    }
+
+    delete b;
+}
+
+#define DICTIONARY_TEST_FILE "wordsegments.txt"
+
+void DictionaryWordTest::TestWordBoundaries() {
+    UErrorCode      status  = U_ZERO_ERROR;
+
+    TextFile phrases(DICTIONARY_TEST_FILE, "UTF8", status);
+    if (U_FAILURE(status)) {
+        dataerrln("Can't open "DICTIONARY_TEST_FILE": %s; skipping test",
+              u_errorName(status));
+        return;
+    }
+
+    // Due to how the word break iterator works,
+    // scripts for languages that use no spaces should use the correct dictionary by default.
+    BreakIterator *wb = BreakIterator::createWordInstance("en", status);
+    if (U_FAILURE(status)) {
+        dataerrln("Word break iterator can not be opened: %s; skipping test",
+              u_errorName(status));
+        return;
+    }
+
+    int32_t pos, pIdx;
+    int32_t testLines = 0;
+    UnicodeString phrase;
+    while (phrases.readLineSkippingComments(phrase, status, FALSE) && U_SUCCESS(status)) {
+        UVector breaks(status);
+
+        for (pIdx = 0; pIdx < phrase.length(); pIdx++) {
+            if (phrase.charAt(pIdx) == 0x007C /* | */) {
+                breaks.addElement(pIdx, status);
+                phrase.remove(pIdx, 1);
+            }
+        }
+        breaks.addElement(pIdx, status);
+
+        wb->setText(phrase);
+        int32_t brkArrPos = 0;
+        while ((pos=wb->next())!=BreakIterator::DONE) {
+            int32_t expectedPos = breaks.elementAti(brkArrPos);
+            if (expectedPos != pos) {
+                errln("Incorrect forward word break on line %d. Expected: %d  Got: %d",
+                    phrases.getLineNumber(), breaks.elementAt(brkArrPos), pos);
+            }
+            brkArrPos++;
+        }
+        brkArrPos = breaks.size() - 1;
+        while ((pos=wb->previous())!=BreakIterator::DONE) {
+            brkArrPos--;
+            int32_t expectedPos = breaks.elementAti(brkArrPos);
+            if (expectedPos != pos) {
+                errln("Incorrect backward word break on line %d. Expected: %d  Got: %d",
+                    phrases.getLineNumber(), breaks.elementAt(brkArrPos), pos);
+            }
+        }
+        testLines++;
+    }
+    delete wb;
+    logln("%d tests were run.", testLines);
+}
+
+void DictionaryWordTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par */)
+{
+    if (exec) logln("TestSuite DictionaryWordTest: "); // suite banner is logged only when exec is set
+    TESTCASE_AUTO_BEGIN; // NOTE(review): TESTCASE_AUTO_* are framework macros not shown here; presumably they map index to a test and set name
+    TESTCASE_AUTO(TestThaiBreaks);
+    TESTCASE_AUTO(TestWordBoundaries);
+    TESTCASE_AUTO_END;
+}
+
+
+#endif
diff --git a/icu4c/source/test/intltest/dicttest.h b/icu4c/source/test/intltest/dicttest.h

new file mode 100644 (file)

index 0000000..ffce470
--- /dev/null
+++ b/icu4c/source/test/intltest/dicttest.h
@@ -0,0 +1,31 @@
+/*
+**********************************************************************
+* Copyright (C) 2011-2011, International Business Machines Corporation 
+* and others.  All Rights Reserved.
+**********************************************************************
+************************************************************************
+*   Date          Name        Description
+*   05/14/2011    grhoten     Creation.
+************************************************************************/
+
+#ifndef DICTTEST_H
+#define DICTTEST_H
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include "intltest.h"
+
+
+class DictionaryWordTest: public IntlTest { // Break-iterator tests for dictionary-based word segmentation.
+public:
+    void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL ); // test-framework entry point; lists the tests below
+    void TestWordBoundaries(); // compares word boundaries with the '|'-marked phrases in wordsegments.txt
+    void TestThaiBreaks(); // checks "th" word and line break offsets on a fixed sample string
+};
+
+#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
+
+#endif
+
diff --git a/icu4c/source/test/intltest/intltest.vcxproj b/icu4c/source/test/intltest/intltest.vcxproj

index 52ada6a373e1e68bdef21ad933abd0c94f47958f..613d63564931417d65bb3f0da2a4758d2f000ec0 100644 (file)
--- a/icu4c/source/test/intltest/intltest.vcxproj
+++ b/icu4c/source/test/intltest/intltest.vcxproj
@@ -224,6 +224,7 @@
    </ItemDefinitionGroup>\r
    <ItemGroup>\r
      <ClCompile Include="bytestrietest.cpp" />\r
+    <ClCompile Include="dicttest.cpp" />\r
      <ClCompile Include="ucharstrietest.cpp" />\r
      <ClCompile Include="itrbbi.cpp" />\r
      <ClCompile Include="rbbiapts.cpp" />\r
@@ -389,6 +390,7 @@
      <ClCompile Include="bidiconf.cpp" />\r
    </ItemGroup>\r
    <ItemGroup>\r
+    <ClInclude Include="dicttest.h" />\r
      <ClInclude Include="itrbbi.h" />\r
      <ClInclude Include="rbbiapts.h" />\r
      <ClInclude Include="rbbitst.h" />\r
@@ -533,4 +535,4 @@
    <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />\r
    <ImportGroup Label="ExtensionTargets">\r
    </ImportGroup>\r
-</Project>
-\ No newline at end of file
+</Project>\r
diff --git a/icu4c/source/test/intltest/intltest.vcxproj.filters b/icu4c/source/test/intltest/intltest.vcxproj.filters

index d7b4159b8caa36bc69e6d105039ab4eb50e6a4c5..11738b65b5dbf60171cbd07a19beba7eddd7a0dc 100644 (file)
--- a/icu4c/source/test/intltest/intltest.vcxproj.filters
+++ b/icu4c/source/test/intltest/intltest.vcxproj.filters
@@ -444,6 +444,9 @@
      <ClCompile Include="alphaindextst.cpp">\r
        <Filter>collation</Filter>\r
      </ClCompile>\r
+    <ClCompile Include="dicttest.cpp">\r
+      <Filter>break iteration</Filter>\r
+    </ClCompile>\r
    </ItemGroup>\r
    <ItemGroup>\r
      <ClInclude Include="itrbbi.h">\r
@@ -812,5 +815,8 @@
      <ClInclude Include="alphaindextst.h">\r
        <Filter>collation</Filter>\r
      </ClInclude>\r
+    <ClInclude Include="dicttest.h">\r
+      <Filter>break iteration</Filter>\r
+    </ClInclude>\r
    </ItemGroup>\r
-</Project>
-\ No newline at end of file
+</Project>\r
diff --git a/icu4c/source/test/intltest/itrbbi.cpp b/icu4c/source/test/intltest/itrbbi.cpp

index 75f01cff0d5e9a6d747b0347bc8d3c77a1f0b99a..c6deee06c3b405701f90a18c4d5c3b287988f85e 100644 (file)
--- a/icu4c/source/test/intltest/itrbbi.cpp
+++ b/icu4c/source/test/intltest/itrbbi.cpp
@@ -1,6 +1,6 @@
  /*
  **********************************************************************
-* Copyright (C) 1998-2001, International Business Machines Corporation 
+* Copyright (C) 1998-2011, International Business Machines Corporation 
  * and others.  All Rights Reserved.
  **********************************************************************
  */
@@ -19,28 +19,27 @@
  #include "itrbbi.h"
  #include "rbbiapts.h"
  #include "rbbitst.h"
+#include "dicttest.h"
+
+#define TESTCLASS(n,classname)        /* switch case n: sets name to the class name and, if exec, logs it and runs a fresh classname via callTest */\
+    case n:                           \
+        name = #classname;            \
+        if (exec) {                   \
+            logln(#classname "---");  \
+            logln("");                \
+            classname t;              \
+            callTest(t, par);         \
+        }                             \
+        break
+
  
  void IntlTestRBBI::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
  {
      if (exec) logln("TestSuite RuleBasedBreakIterator: ");
      switch (index) {
-        case 0:
-            name = "RBBIAPITest"; 
-            if (exec) {
-                logln("RBBIAPITest--"); logln("");
-                RBBIAPITest test;
-                callTest( test, par );
-            }
-            break;
-
-        case 1:
-           name = "RBBITest"; 
-            if (exec) {
-                logln("RBBITest---"); logln("");
-                RBBITest test;
-                callTest( test, par );
-            }
-            break;
+        TESTCLASS(0, RBBIAPITest);
+        TESTCLASS(1, RBBITest);
+        TESTCLASS(2, DictionaryWordTest);
          default: name=""; break;
      }
  }
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp

index aed26287c1369f95abca1241fc683b9c566cb256..7dfeed776aa19ca35d6afb750113e408c44ab324 100644 (file)
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -134,17 +134,15 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
  #if !UCONFIG_NO_FILE_IO
          case 21: name = "TestBug5775";
              if (exec) TestBug5775();                           break;
-        case 22: name = "TestThaiBreaks";
-            if (exec) TestThaiBreaks();                        break;
-        case 23: name = "TestTailoredBreaks";
+        case 22: name = "TestTailoredBreaks";
              if (exec) TestTailoredBreaks();                    break;
  #else
-        case 21: case 22: case 23: name = "skip";
+        case 21: case 22: name = "skip";
              break;
  #endif
-        case 24: name = "TestDictRules";
+        case 23: name = "TestDictRules";
              if (exec) TestDictRules();                         break;
-        case 25: name = "TestBug5532";
+        case 24: name = "TestBug5532";
              if (exec) TestBug5532();                           break;
          default: name = ""; break; //needed to end loop
      }
@@ -1810,56 +1808,6 @@ end_test:
  #endif
  }
  
-void RBBITest::TestThaiBreaks() {
-    UErrorCode status=U_ZERO_ERROR;
-    BreakIterator* b;
-    Locale locale = Locale("th");
-    int32_t p, index;
-    UChar c[]= { 
-            0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B, 
-            0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19, 
-            0x0E16, 0x0E49, 0x0E33, 0x0000
-    };
-    int32_t expectedWordResult[] = {
-            2, 3, 6, 10, 11, 15, 17, 20, 22
-    };
-    int32_t expectedLineResult[] = {
-            3, 6, 11, 15, 17, 20, 22
-    };
-
-    int32_t size = u_strlen(c);
-    UnicodeString text=UnicodeString(c);
-    
-    b = BreakIterator::createWordInstance(locale, status);
-    if (U_FAILURE(status)) {
-        errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
-        return;
-    }
-    b->setText(text);
-    p = index = 0;
-    while ((p=b->next())!=BreakIterator::DONE && p < size) {
-        if (p != expectedWordResult[index++]) {
-            errln("Incorrect break given by thai word break iterator. Expected: %d  Got: %d", expectedWordResult[index-1], p);
-        }
-    }
-    delete b;
-    
-    b = BreakIterator::createLineInstance(locale, status);
-    if (U_FAILURE(status)) {
-        printf("Unable to create thai line break iterator.\n");
-        return;
-    }
-    b->setText(text);
-    p = index = 0;
-    while ((p=b->next())!=BreakIterator::DONE && p < size) {
-        if (p != expectedLineResult[index++]) {
-            errln("Incorrect break given by thai line break iterator. Expected: %d  Got: %d", expectedLineResult[index-1], p);
-        }
-    }
-
-    delete b;
-}
-
  // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
  // Words don't include colon or period (cldrbug #1969).
  static const char    posxWordText[]     = "Can't have breaks in xx:yy or struct.field for CS-types.";
diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h

index d46c9b59976450433a79f189964a93e20cda0e0c..7effb799e78fd4faede3687a9c7e5f1566ebadbc 100644 (file)
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@@ -1,5 +1,5 @@
  /*************************************************************************
- * Copyright (c) 1999-2010, International Business Machines
+ * Copyright (c) 1999-2011, International Business Machines
   * Corporation and others. All Rights Reserved.
   *************************************************************************
   *   Date        Name        Description
@@ -68,7 +68,6 @@ public:
      void TestTrieDict();
      void TestUnicodeFiles();
      void TestBug5775();
-    void TestThaiBreaks();
      void TestTailoredBreaks();
      void TestDictRules();
      void TestBug5532();
diff --git a/icu4c/source/test/testdata/wordsegments.txt b/icu4c/source/test/testdata/wordsegments.txt

new file mode 100644 (file)

index 0000000..2dab96d
--- /dev/null
+++ b/icu4c/source/test/testdata/wordsegments.txt
@@ -0,0 +1,23 @@
+# Copyright (C) 2011-2011, International Business Machines Corporation
+# and others. All Rights Reserved.
+#
+#   file name:  wordsegments.txt
+#   encoding:   UTF-8
+#
+#   created on: 2011may14
+#   created by: George Rhoten
+#   created by: Nathan Wells
+#
+# Word boundary test data for languages that contain no spaces.
+# Boundaries are delimited with the | character so that it's easier to debug.
+#
+# If you have test data with zero width spaces to delimit the words, use the following command example.
+# Be sure to copy the zero width space in the sed command.
+# echo 'សូមចំណាយពេលបន្តិចដើម្បីអធិស្ឋានអរព្រះគុណដល់ព្រះអង្គ' | sed 's//\|/g'
+#
+
+# Thai
+กู| |กิน|กุ้ง| |ปิ้่|งอ|ยู่|ใน|ถ้ำ
+
+# Khmer
+សូម|ចំណាយពេល|បន្តិច|ដើម្បី|អធិស្ឋាន|អរ|ព្រះគុណ|ដល់|ព្រះអង្គ
author	Peter Edberg <pedberg@unicode.org>
	Wed, 13 Jul 2011 06:49:42 +0000 (06:49 +0000)
committer	Peter Edberg <pedberg@unicode.org>
	Wed, 13 Jul 2011 06:49:42 +0000 (06:49 +0000)
icu4c/source/test/intltest/Makefile.in		patch \| blob \| history
icu4c/source/test/intltest/dicttest.cpp	[new file with mode: 0644]	patch \| blob
icu4c/source/test/intltest/dicttest.h	[new file with mode: 0644]	patch \| blob
icu4c/source/test/intltest/intltest.vcxproj		patch \| blob \| history
icu4c/source/test/intltest/intltest.vcxproj.filters		patch \| blob \| history
icu4c/source/test/intltest/itrbbi.cpp		patch \| blob \| history
icu4c/source/test/intltest/rbbitst.cpp		patch \| blob \| history
icu4c/source/test/intltest/rbbitst.h		patch \| blob \| history
icu4c/source/test/testdata/wordsegments.txt	[new file with mode: 0644]	patch \| blob