ICU-4833 Update RBBI title case rules, replace obsolete rule syntax.

author Andy Heninger <andy.heninger@gmail.com>

Tue, 4 Mar 2014 19:58:04 +0000 (19:58 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Tue, 4 Mar 2014 19:58:04 +0000 (19:58 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Tue, 4 Mar 2014 19:58:04 +0000 (19:58 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Tue, 4 Mar 2014 19:58:04 +0000 (19:58 +0000)
diff --git a/icu4c/source/data/brkitr/title.txt b/icu4c/source/data/brkitr/title.txt

index e3f6566c90139d53c2175bafa6dfc7f9abf01f4a..30c1c40d45b1d602cfd669702eb6990c5fbed42d 100644 (file)
--- a/icu4c/source/data/brkitr/title.txt
+++ b/icu4c/source/data/brkitr/title.txt
@@ -1,32 +1,53 @@
-# Copyright (c) 2002-2003, International Business Machines Corporation and
+# Copyright (c) 2002-2014, International Business Machines Corporation and
  # others. All Rights Reserved.
  #
  #  Title Casing Break Rules
  #
  
+
  $CaseIgnorable   = [[:Mn:][:Me:][:Cf:][:Lm:][:Sk:] \u0027 \u00AD \u2019];
  $Cased           = [[:Upper_Case:][:Lower_Case:][:Lt:]  - $CaseIgnorable];
-$NotCased        = [^ $Cased];
+$NotCased        = [[^ $Cased] - $CaseIgnorable];
  
-#
-#  If the iterator was not stopped on a cased character, advance it to the first cased char
-#
-$NotCased+;
+!!forward;
+
+#  If the iterator begins on a CaseIgnorable or NotCased character,
+#  advance it past it/them. This can occur at the start-of-text, or
+#  after application of the safe-reverse rule.
+
+($CaseIgnorable | $NotCased)*;
+
+#   Normal exact forward rule: beginning at the start of a word
+#         (at a cased character), advance through the word and through
+#         the uncased characters following the word.
+
+$Cased ($Cased | $CaseIgnorable)* ($NotCased | $CaseIgnorable)*;
  
-#
-#  If the iterator starts on a cased item, advance through all adjacent cased items plus
-#    any non-cased stuff, to reach the start of the next word.
-#
-$Cased ($Cased | $CaseIgnorable)* $NotCased*;
  
-#
  #  Reverse Rules
-#
+!!reverse;
  
-! $NotCased+;
+#  Normal Rule, will work nearly universally, so long as there is a
+#    start-of-word preceding the current iteration position.
  
-#
-#  If the iterator starts on a cased item, advance through all adjacent cased items plus
-#    any non-cased stuff, to reach the start of the next word.
-#
-! $NotCased* ($Cased | $CaseIgnorable)* $Cased;
-\ No newline at end of file
+($NotCased | $CaseIgnorable)* ($Cased | $CaseIgnorable)* $Cased;
+
+#  Short rule, will be effective only when moving to the start of text,
+#    with no word (cased character) preceding the current iteration position.
+
+($NotCased | $CaseIgnorable)*;
+
+!!safe_reverse;
+
+# Safe Reverse: the exact forward rule must not start in the middle
+#  of a word, so the safe reverse skips over any Cased characters,
+#  leaving it just before the start of a word.
+
+($Cased | $CaseIgnorable)*;
+
+!!safe_forward;
+
+# Safe Forward, nothing needs to be done, the exact Reverse rules will
+#   always find valid boundaries from any starting position.
+#   Still, some rule is needed, so '.', a one character movement.
+.;
diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt

index b6c04ded6d3dea284e163db5c86d27bd4a1a9829..e5df1e115b897533c0815f1b625b97d379cf31cb 100644 (file)
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@@ -593,7 +593,9 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
  <data>•123  •Start •with •a •number.•</data>
  
  <data>•'•start •with •a •case-•ignorable •cha'r'a'cter•</data>
-
+<data>•'  '' •start •with •case-•ignorable & •case-•insensitive •cha'r'a'cter•</data>
+<data>•  ''•aaa' •bbb '•ccc' '•ddd''' '''•eee   '''•fff'''   •ggg  ''•</data>
+# Note: apostrophe is case-ignorable. space is not cased.
  
  ##########################################################################################
  #
author	Andy Heninger <andy.heninger@gmail.com>
	Tue, 4 Mar 2014 19:58:04 +0000 (19:58 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Tue, 4 Mar 2014 19:58:04 +0000 (19:58 +0000)
icu4c/source/data/brkitr/title.txt		patch | blob | history
icu4c/source/test/testdata/rbbitst.txt		patch | blob | history