From c68b5d9d38e9e09640f00db1ae890a8e2b89ade0 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Fri, 15 Feb 2013 07:17:59 +0000 Subject: [PATCH] ICU-9077 Enhancements to break iteration tests. X-SVN-Rev: 33233 --- icu4c/source/data/brkitr/char.txt | 8 ++++++-- icu4c/source/test/intltest/rbbitst.cpp | 6 +++--- icu4c/source/test/testdata/rbbitst.txt | 13 ++++++++----- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/icu4c/source/data/brkitr/char.txt b/icu4c/source/data/brkitr/char.txt index c0d9731c199..abf71fcf402 100644 --- a/icu4c/source/data/brkitr/char.txt +++ b/icu4c/source/data/brkitr/char.txt @@ -1,5 +1,5 @@ # -# Copyright (C) 2002-2012, International Business Machines Corporation and others. +# Copyright (C) 2002-2013, International Business Machines Corporation and others. # All Rights Reserved. # # file: char.txt @@ -66,11 +66,15 @@ $SpacingMark [^$Control $CR $LF]; ## ------------------------------------------------- +# We don't logically need safe char break rules, but if we don't provide any at all +# the engine for preceding() and following() will fall back to the +# old style inefficient algorithm. !!safe_reverse; - +$LF $CR; ## ------------------------------------------------- !!safe_forward; +$CR $LF; diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 3d861bb1bcc..4281bfab01e 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -992,10 +992,10 @@ void RBBITest::executeTest(TestParams *t) { int32_t expectedBreak = BreakIterator::DONE; // For supplementaries, back up to the start of the character. - int32_t currentCharStart = i < t->dataToBreak.length()? t->dataToBreak.getChar32Start(i) : i; + // int32_t currentCharStart = i < t->dataToBreak.length()? t->dataToBreak.getChar32Start(i) : i; - for (int32_t j=currentCharStart-1; j >= 0; j--) { - // for (int32_t j=i-1; j >= 0; j--) { + // for (int32_t j=currentCharStart-1; j >= 0; j--) { + for (int32_t j=i-1; j >= 0; j--) { if (t->expectedBreaks->elementAti(j) != 0) { expectedBreak = j; break; diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 5a240f19ed9..49f310c8f60 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -33,8 +33,9 @@ # Temp debugging tests - -•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udcfb• + +•\U00010020•\U00010000\u0301•x• +•\U00010020•\U00010000\N{COMBINING MACRON}• ######################################################################################## # @@ -167,8 +168,7 @@ •abc\U00010300<200> •abc\N{DESERET SMALL LETTER ENG}<200> •abc\N{MATHEMATICAL BOLD SMALL Z}<200> •abc\N{MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL}<200> • # Unassigned code points -# TODO: This case should pass. -#•abc<200>\U0001D800•def<200>\U0001D3FF• • +•abc<200>\U0001D800•def<200>\U0001D3FF• • # Hiragana & Katakana stay together, but separates from each other and Latin. # *** what to do about theoretical combos of chars? i.e. hiragana + accent @@ -539,7 +539,7 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal # Surrogate line break tests. # -#•\u4e01•\ud840\udc01•\u4e02•abc •\ue000 •\udb80\udc01• #TODO: should be same as the next line. +•\u4e01•\ud840\udc01•\u4e02•abc •\ue000 •\udb80\udc01• #This line and the following are equivalent. •\u4e01•\U20001•\u4e02•abc •\ue000 •\Uf0001• # Regression for bug 836 @@ -819,6 +819,9 @@ Bangkok)• +// TODO: problems with Finnish line break rules cause these two lines to fail. #•abc •- •def •abc •-def •abc- •def •abc-•def• # With ASCII hyphen #•abc •‐ •def •abc •‐def •abc‐ •def •abc‐•def• # With Unicode u2010 hyphen +•abc •- •def •abc •-def •abc- •def • # With ASCII hyphen +•abc •‐ •def •abc •‐def •abc‐ •def • # With Unicode u2010 hyphen -- 2.40.0