ICU-9077 Enhancements to break iteration tests.

author Andy Heninger <andy.heninger@gmail.com>

Fri, 15 Feb 2013 07:17:59 +0000 (07:17 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Fri, 15 Feb 2013 07:17:59 +0000 (07:17 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Fri, 15 Feb 2013 07:17:59 +0000 (07:17 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Fri, 15 Feb 2013 07:17:59 +0000 (07:17 +0000)
diff --git a/icu4c/source/data/brkitr/char.txt b/icu4c/source/data/brkitr/char.txt

index c0d9731c1993f1b829968ef72d477d9e483baa3c..abf71fcf402edc6ca05fe62c39aecb6c3ce9d039 100644 (file)
--- a/icu4c/source/data/brkitr/char.txt
+++ b/icu4c/source/data/brkitr/char.txt
@@ -1,5 +1,5 @@
  #
-#   Copyright (C) 2002-2012, International Business Machines Corporation and others.
+#   Copyright (C) 2002-2013, International Business Machines Corporation and others.
  #       All Rights Reserved.
  #
  #   file:  char.txt 
@@ -66,11 +66,15 @@ $SpacingMark [^$Control $CR $LF];
  
  
  ## -------------------------------------------------
+#  We don't logically need safe char break rules, but if we don't provide any at all
+#  the engine for preceding() and following() will fall back to the
+#  old style inefficient algorithm.
  
  !!safe_reverse;
-
+$LF $CR;
  
  ## -------------------------------------------------
  
  !!safe_forward;
+$CR $LF;
  
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp

index 3d861bb1bcc9791c677f6a8cdfc4a3df378a60a5..4281bfab01edf916337de60246ebeec47858f646 100644 (file)
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -992,10 +992,10 @@ void RBBITest::executeTest(TestParams *t) {
          int32_t expectedBreak = BreakIterator::DONE;
  
          // For supplementaries, back up to the start of the character.
-        int32_t currentCharStart = i < t->dataToBreak.length()? t->dataToBreak.getChar32Start(i) : i;
+        // int32_t currentCharStart = i < t->dataToBreak.length()? t->dataToBreak.getChar32Start(i) : i;
  
-        for (int32_t j=currentCharStart-1; j >= 0; j--) {
-        // for (int32_t j=i-1; j >= 0; j--) {
+        // for (int32_t j=currentCharStart-1; j >= 0; j--) {
+        for (int32_t j=i-1; j >= 0; j--) {
              if (t->expectedBreaks->elementAti(j) != 0) {
                  expectedBreak = j;
                  break;
diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt

index 5a240f19ed9a2e082228b67f6856dc8b2050a6ac..49f310c8f60388d5c31f0d6bbde9d93dff711f2f 100644 (file)
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@@ -33,8 +33,9 @@
  
  
  #   Temp debugging tests 
-<line>
-<data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udcfb•</data>
+<char>
+<data>•\U00010020•\U00010000\u0301•x•</data>
+<data>•\U00010020•\U00010000\N{COMBINING MACRON}•</data>
  
  ########################################################################################
  #
@@ -167,8 +168,7 @@
  <data>•abc\U00010300<200> •abc\N{DESERET SMALL LETTER ENG}<200> •abc\N{MATHEMATICAL BOLD SMALL Z}<200> •abc\N{MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL}<200> •</data>
  
  # Unassigned code points
-# TODO: This case should pass.
-#<data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>
+<data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>
  
  # Hiragana & Katakana stay together, but separates from each other and Latin.
  # *** what to do about theoretical combos of chars? i.e. hiragana + accent
@@ -539,7 +539,7 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
  
  #      Surrogate line break tests.
  #
-#<data>•\u4e01•\ud840\udc01•\u4e02•abc •\ue000 •\udb80\udc01•</data> #TODO: should be same as the next line.
+<data>•\u4e01•\ud840\udc01•\u4e02•abc •\ue000 •\udb80\udc01•</data>  #This line and the following are equivalent.
  <data>•\u4e01•\U20001•\u4e02•abc •\ue000 •\Uf0001•</data>
  
  #      Regression for bug 836
@@ -819,6 +819,9 @@ Bangkok)•</data>
  
  <locale fi>
  <line>
+# TODO: problems with Finnish line break rules cause these two lines to fail.
  #<data>•abc •- •def    •abc •-def    •abc- •def   •abc-•def•</data>   # With ASCII hyphen
  #<data>•abc •‐ •def    •abc •‐def    •abc‐ •def   •abc‐•def•</data>   # With Unicode u2010 hyphen
  
+<data>•abc •- •def    •abc •-def    •abc- •def   •</data>   # With ASCII hyphen
+<data>•abc •‐ •def    •abc •‐def    •abc‐ •def   •</data>   # With Unicode u2010 hyphen
author	Andy Heninger <andy.heninger@gmail.com>
	Fri, 15 Feb 2013 07:17:59 +0000 (07:17 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Fri, 15 Feb 2013 07:17:59 +0000 (07:17 +0000)
icu4c/source/data/brkitr/char.txt		patch | blob | history
icu4c/source/test/intltest/rbbitst.cpp		patch | blob | history
icu4c/source/test/testdata/rbbitst.txt		patch | blob | history