From: Peter Edberg Date: Wed, 13 Jul 2011 06:49:42 +0000 (+0000) Subject: ICU-8576 Dictionary break test updates from George Rhoten X-Git-Tag: milestone-59-0-1~4650 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=62d26cc5fe8cc66696143c44a43d664c2d7be100;p=icu ICU-8576 Dictionary break test updates from George Rhoten X-SVN-Rev: 30327 --- diff --git a/icu4c/source/test/intltest/Makefile.in b/icu4c/source/test/intltest/Makefile.in index e603509682e..fb1e67a02bf 100644 --- a/icu4c/source/test/intltest/Makefile.in +++ b/icu4c/source/test/intltest/Makefile.in @@ -51,7 +51,7 @@ tfsmalls.o tmsgfmt.o trcoll.o tscoll.o tsdate.o tsdcfmsy.o tsdtfmsy.o \ tsmthred.o tsnmfmt.o tsputil.o tstnrapi.o tstnorm.o tzbdtest.o \ tzregts.o tztest.o ucdtest.o usettest.o ustrtest.o strcase.o transtst.o strtest.o thcoll.o \ bytestrietest.o ucharstrietest.o \ -itrbbi.o rbbiapts.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \ +itrbbi.o rbbiapts.o dicttest.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \ testutil.o transrt.o trnserr.o normconf.o sfwdchit.o \ jamotest.o srchtest.o reptest.o regextst.o \ itrbnf.o itrbnfrt.o itrbnfp.o ucaconf.o icusvtst.o \ diff --git a/icu4c/source/test/intltest/dicttest.cpp b/icu4c/source/test/intltest/dicttest.cpp new file mode 100644 index 00000000000..646084a20d0 --- /dev/null +++ b/icu4c/source/test/intltest/dicttest.cpp @@ -0,0 +1,140 @@ +/* +********************************************************************** +* Copyright (C) 2011-2011, International Business Machines Corporation +* and others. All Rights Reserved. +********************************************************************** +************************************************************************ +* Date Name Description +* 05/14/2011 grhoten Creation. +************************************************************************/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_BREAK_ITERATION + +#include "dicttest.h" +#include "textfile.h" +#include "uvector.h" +#include "unicode/rbbi.h" + +void DictionaryWordTest::TestThaiBreaks() { + UErrorCode status=U_ZERO_ERROR; + BreakIterator* b; + Locale locale = Locale("th"); + int32_t p, index; + UChar c[]= { + 0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B, + 0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19, + 0x0E16, 0x0E49, 0x0E33, 0x0000 + }; + int32_t expectedWordResult[] = { + 2, 3, 6, 10, 11, 15, 17, 20, 22 + }; + int32_t expectedLineResult[] = { + 3, 6, 11, 15, 17, 20, 22 + }; + + int32_t size = u_strlen(c); + UnicodeString text=UnicodeString(c); + + b = BreakIterator::createWordInstance(locale, status); + if (U_FAILURE(status)) { + errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status)); + return; + } + b->setText(text); + p = index = 0; + while ((p=b->next())!=BreakIterator::DONE && p < size) { + if (p != expectedWordResult[index++]) { + errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult[index-1], p); + } + } + delete b; + + b = BreakIterator::createLineInstance(locale, status); + if (U_FAILURE(status)) { + printf("Unable to create thai line break iterator.\n"); + return; + } + b->setText(text); + p = index = 0; + while ((p=b->next())!=BreakIterator::DONE && p < size) { + if (p != expectedLineResult[index++]) { + errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index-1], p); + } + } + + delete b; +} + +#define DICTIONARY_TEST_FILE "wordsegments.txt" + +void DictionaryWordTest::TestWordBoundaries() { + UErrorCode status = U_ZERO_ERROR; + + TextFile phrases(DICTIONARY_TEST_FILE, "UTF8", status); + if (U_FAILURE(status)) { + dataerrln("Can't open "DICTIONARY_TEST_FILE": %s; skipping test", + u_errorName(status)); + return; + } + + // Due to how the word break iterator works, + // scripts for languages that use no spaces should use the correct dictionary by default. + BreakIterator *wb = BreakIterator::createWordInstance("en", status); + if (U_FAILURE(status)) { + dataerrln("Word break iterator can not be opened: %s; skipping test", + u_errorName(status)); + return; + } + + int32_t pos, pIdx; + int32_t testLines = 0; + UnicodeString phrase; + while (phrases.readLineSkippingComments(phrase, status, FALSE) && U_SUCCESS(status)) { + UVector breaks(status); + + for (pIdx = 0; pIdx < phrase.length(); pIdx++) { + if (phrase.charAt(pIdx) == 0x007C /* | */) { + breaks.addElement(pIdx, status); + phrase.remove(pIdx, 1); + } + } + breaks.addElement(pIdx, status); + + wb->setText(phrase); + int32_t brkArrPos = 0; + while ((pos=wb->next())!=BreakIterator::DONE) { + int32_t expectedPos = breaks.elementAti(brkArrPos); + if (expectedPos != pos) { + errln("Incorrect forward word break on line %d. Expected: %d Got: %d", + phrases.getLineNumber(), breaks.elementAt(brkArrPos), pos); + } + brkArrPos++; + } + brkArrPos = breaks.size() - 1; + while ((pos=wb->previous())!=BreakIterator::DONE) { + brkArrPos--; + int32_t expectedPos = breaks.elementAti(brkArrPos); + if (expectedPos != pos) { + errln("Incorrect backward word break on line %d. Expected: %d Got: %d", + phrases.getLineNumber(), breaks.elementAt(brkArrPos), pos); + } + } + testLines++; + } + delete wb; + logln("%d tests were run.", testLines); +} + +void DictionaryWordTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par */) +{ + if (exec) logln("TestSuite DictionaryWordTest: "); + TESTCASE_AUTO_BEGIN; + TESTCASE_AUTO(TestThaiBreaks); + TESTCASE_AUTO(TestWordBoundaries); + TESTCASE_AUTO_END; +} + + +#endif diff --git a/icu4c/source/test/intltest/dicttest.h b/icu4c/source/test/intltest/dicttest.h new file mode 100644 index 00000000000..ffce470d576 --- /dev/null +++ b/icu4c/source/test/intltest/dicttest.h @@ -0,0 +1,31 @@ +/* +********************************************************************** +* Copyright (C) 2011-2011, International Business Machines Corporation +* and others. All Rights Reserved. +********************************************************************** +************************************************************************ +* Date Name Description +* 05/14/2011 grhoten Creation. +************************************************************************/ + +#ifndef DICTTEST_H +#define DICTTEST_H + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_BREAK_ITERATION + +#include "intltest.h" + + +class DictionaryWordTest: public IntlTest { +public: + void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL ); + void TestWordBoundaries(); + void TestThaiBreaks(); +}; + +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ + +#endif + diff --git a/icu4c/source/test/intltest/intltest.vcxproj b/icu4c/source/test/intltest/intltest.vcxproj index 52ada6a373e..613d6356493 100644 --- a/icu4c/source/test/intltest/intltest.vcxproj +++ b/icu4c/source/test/intltest/intltest.vcxproj @@ -224,6 +224,7 @@ + @@ -389,6 +390,7 @@ + @@ -533,4 +535,4 @@ - \ No newline at end of file + diff --git a/icu4c/source/test/intltest/intltest.vcxproj.filters b/icu4c/source/test/intltest/intltest.vcxproj.filters index d7b4159b8ca..11738b65b5d 100644 --- a/icu4c/source/test/intltest/intltest.vcxproj.filters +++ b/icu4c/source/test/intltest/intltest.vcxproj.filters @@ -444,6 +444,9 @@ collation + + break iteration + @@ -812,5 +815,8 @@ collation + + break iteration + - \ No newline at end of file + diff --git a/icu4c/source/test/intltest/itrbbi.cpp b/icu4c/source/test/intltest/itrbbi.cpp index 75f01cff0d5..c6deee06c3b 100644 --- a/icu4c/source/test/intltest/itrbbi.cpp +++ b/icu4c/source/test/intltest/itrbbi.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 1998-2001, International Business Machines Corporation +* Copyright (C) 1998-2011, International Business Machines Corporation * and others. All Rights Reserved. ********************************************************************** */ @@ -19,28 +19,27 @@ #include "itrbbi.h" #include "rbbiapts.h" #include "rbbitst.h" +#include "dicttest.h" + +#define TESTCLASS(n,classname) \ + case n: \ + name = #classname; \ + if (exec) { \ + logln(#classname "---"); \ + logln(""); \ + classname t; \ + callTest(t, par); \ + } \ + break + void IntlTestRBBI::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par ) { if (exec) logln("TestSuite RuleBasedBreakIterator: "); switch (index) { - case 0: - name = "RBBIAPITest"; - if (exec) { - logln("RBBIAPITest--"); logln(""); - RBBIAPITest test; - callTest( test, par ); - } - break; - - case 1: - name = "RBBITest"; - if (exec) { - logln("RBBITest---"); logln(""); - RBBITest test; - callTest( test, par ); - } - break; + TESTCLASS(0, RBBIAPITest); + TESTCLASS(1, RBBITest); + TESTCLASS(2, DictionaryWordTest); default: name=""; break; } } diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index aed26287c13..7dfeed776aa 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -134,17 +134,15 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha #if !UCONFIG_NO_FILE_IO case 21: name = "TestBug5775"; if (exec) TestBug5775(); break; - case 22: name = "TestThaiBreaks"; - if (exec) TestThaiBreaks(); break; - case 23: name = "TestTailoredBreaks"; + case 22: name = "TestTailoredBreaks"; if (exec) TestTailoredBreaks(); break; #else - case 21: case 22: case 23: name = "skip"; + case 21: case 22: name = "skip"; break; #endif - case 24: name = "TestDictRules"; + case 23: name = "TestDictRules"; if (exec) TestDictRules(); break; - case 25: name = "TestBug5532"; + case 24: name = "TestBug5532"; if (exec) TestBug5532(); break; default: name = ""; break; //needed to end loop } @@ -1810,56 +1808,6 @@ end_test: #endif } -void RBBITest::TestThaiBreaks() { - UErrorCode status=U_ZERO_ERROR; - BreakIterator* b; - Locale locale = Locale("th"); - int32_t p, index; - UChar c[]= { - 0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B, - 0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19, - 0x0E16, 0x0E49, 0x0E33, 0x0000 - }; - int32_t expectedWordResult[] = { - 2, 3, 6, 10, 11, 15, 17, 20, 22 - }; - int32_t expectedLineResult[] = { - 3, 6, 11, 15, 17, 20, 22 - }; - - int32_t size = u_strlen(c); - UnicodeString text=UnicodeString(c); - - b = BreakIterator::createWordInstance(locale, status); - if (U_FAILURE(status)) { - errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status)); - return; - } - b->setText(text); - p = index = 0; - while ((p=b->next())!=BreakIterator::DONE && p < size) { - if (p != expectedWordResult[index++]) { - errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult[index-1], p); - } - } - delete b; - - b = BreakIterator::createLineInstance(locale, status); - if (U_FAILURE(status)) { - printf("Unable to create thai line break iterator.\n"); - return; - } - b->setText(text); - p = index = 0; - while ((p=b->next())!=BreakIterator::DONE && p < size) { - if (p != expectedLineResult[index++]) { - errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index-1], p); - } - } - - delete b; -} - // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX" // Words don't include colon or period (cldrbug #1969). static const char posxWordText[] = "Can't have breaks in xx:yy or struct.field for CS-types."; diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h index d46c9b59976..7effb799e78 100644 --- a/icu4c/source/test/intltest/rbbitst.h +++ b/icu4c/source/test/intltest/rbbitst.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 1999-2010, International Business Machines + * Copyright (c) 1999-2011, International Business Machines * Corporation and others. All Rights Reserved. ************************************************************************* * Date Name Description @@ -68,7 +68,6 @@ public: void TestTrieDict(); void TestUnicodeFiles(); void TestBug5775(); - void TestThaiBreaks(); void TestTailoredBreaks(); void TestDictRules(); void TestBug5532(); diff --git a/icu4c/source/test/testdata/wordsegments.txt b/icu4c/source/test/testdata/wordsegments.txt new file mode 100644 index 00000000000..2dab96da808 --- /dev/null +++ b/icu4c/source/test/testdata/wordsegments.txt @@ -0,0 +1,23 @@ +# Copyright (C) 2011-2011, International Business Machines Corporation +# and others. All Rights Reserved. +# +# file name: wordsegments.txt +# encoding: UTF-8 +# +# created on: 2011may14 +# created by: George Rhoten +# created by: Nathan Wells +# +# Word boundary test data for languages that contain no spaces. +# Boundaries are deliminated with the | character so that it's easier to debug. +# +# If you have test data with zero width spaces to deliminate the words, use the following command example. +# Be sure to copy the zero width space in the sed command. +# echo 'សូម​ចំណាយពេល​បន្តិច​ដើម្បី​អធិស្ឋាន​អរ​ព្រះគុណ​ដល់​ព្រះអង្គ' | sed 's/​/\|/g' +# + +# Thai +กู| |กิน|กุ้ง| |ปิ้่|งอ|ยู่|ใน|ถ้ำ + +# Khmer +សូម|ចំណាយពេល|បន្តិច|ដើម្បី|អធិស្ឋាន|អរ|ព្រះគុណ|ដល់|ព្រះអង្គ