ICU-9600 sync up word.txt and word_POSIX.txt

author Andy Heninger <andy.heninger@gmail.com>

Fri, 28 Sep 2012 21:31:14 +0000 (21:31 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Fri, 28 Sep 2012 21:31:14 +0000 (21:31 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Fri, 28 Sep 2012 21:31:14 +0000 (21:31 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Fri, 28 Sep 2012 21:31:14 +0000 (21:31 +0000)
diff --git a/icu4c/source/data/brkitr/word.txt b/icu4c/source/data/brkitr/word.txt

index ba995ad8f1e3cc0bc6984d2cfa819e504d3d2a82..7db43119e8e9e80062e45bb07ac091a91180dbac 100644 (file)
--- a/icu4c/source/data/brkitr/word.txt
+++ b/icu4c/source/data/brkitr/word.txt
@@ -9,7 +9,7 @@
  #      These rules are based on UAX #29 Revision 20 for Unicode Version 6.2
  #
  # Note:  Updates to word.txt will usually need to be merged into
-#        word_POSIX.txt and word_ja.txt also.
+#        word_POSIX.txt also.
  
  ##############################################################################
  #
diff --git a/icu4c/source/data/brkitr/word_POSIX.txt b/icu4c/source/data/brkitr/word_POSIX.txt

index 8938070b5ae18eb125f68b2262d993498c2e43ab..7d36481beb012d871cc7671ac3986fe223b558bb 100644 (file)
--- a/icu4c/source/data/brkitr/word_POSIX.txt
+++ b/icu4c/source/data/brkitr/word_POSIX.txt
@@ -9,7 +9,7 @@
  #      These rules are based on UAX #29 Revision 20 for Unicode Version 6.2
  #
  # Note:  Updates to word.txt will usually need to be merged into
-#        word_POSIX.txt and word_ja.txt also.
+#        word_POSIX.txt also.
  
  ##############################################################################
  #
@@ -29,14 +29,16 @@ $LF           = [\p{Word_Break = LF}];
  $Newline      = [\p{Word_Break = Newline}];
  $Extend       = [\p{Word_Break = Extend}];
  $Format       = [\p{Word_Break = Format}];
+$Hiragana     = [:Hiragana:];
  $Katakana     = [\p{Word_Break = Katakana}];
+$Han          = [:Han:];
  $ALetter      = [\p{Word_Break = ALetter}];
  $MidNumLet    = [\p{Word_Break = MidNumLet} - [.]];
  $MidLetter    = [\p{Word_Break = MidLetter} - [\:]];
  $MidNum       = [\p{Word_Break = MidNum} [.]];
  $Numeric      = [\p{Word_Break = Numeric}];
-$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
  $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
  
  
  #   Dictionary character set, for triggering language-based break engines. Currently
@@ -44,15 +46,22 @@ $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
  #   5.0 or later as the definition of Complex_Context was corrected to include all
  #   characters requiring dictionary break.
  
-$dictionary   = [:LineBreak = Complex_Context:];
  $Control        = [\p{Grapheme_Cluster_Break = Control}]; 
-$ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]];   # Note:  default ALetter does not
-                                                             #  include the dictionary characters.
+$HangulSyllable = [\uac00-\ud7a3];
+$ComplexContext = [:LineBreak = Complex_Context:];
+$KanaKanji      = [$Han $Hiragana $Katakana];
+$dictionaryCJK  = [$KanaKanji $HangulSyllable];
+$dictionary     = [$ComplexContext $dictionaryCJK];
+
+# leave CJK scripts out of ALetterPlus
+$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
+
  
  #
  #  Rules 4    Ignore Format and Extend characters, 
  #             except when they appear at the beginning of a region of text.
  #
+# TODO: check if handling of katakana in dictionary makes rules incorrect/void
  $KatakanaEx     = $Katakana     ($Extend |  $Format)*;
  $ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;
  $MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
@@ -62,7 +71,6 @@ $NumericEx      = $Numeric      ($Extend |  $Format)*;
  $ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
  $Regional_IndicatorEx = $Regional_Indicator ($Extend |  $Format)*;
  
-$Hiragana       = [\p{script=Hiragana}];
  $Ideographic    = [\p{Ideographic}];
  $HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
  $IdeographicEx  = $Ideographic  ($Extend |  $Format)*;
@@ -80,13 +88,14 @@ $CR $LF;
  #          of a region of Text.   The rule here comes into play when the start of text
  #          begins with a group of Format chars, or with a "word" consisting of a single
  #          char that is not in any of the listed word break categories followed by
-#          format char(s).
-[^$CR $LF $Newline]? ($Extend |  $Format)+;
+#          format char(s), and that is not a CJK dictionary character.
+[^$CR $LF $Newline $dictionaryCJK]? ($Extend |  $Format)+;
  
  $NumericEx {100};
  $ALetterEx {200};
-$KatakanaEx {300};       # note:  these status values override those from rule 5
-$HiraganaEx {300};       #        by virtual of being numerically larger.
+$HangulSyllable {200};
+$KatakanaEx {400};       # note:  these status values override those from rule 5
+$HiraganaEx {400};       #        by virtue of being numerically larger.
  $IdeographicEx {400};    #
  
  #
@@ -115,24 +124,29 @@ $NumericEx $ALetterEx {200};
  $NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
  
  # rule 13
-
-$KatakanaEx  $KatakanaEx {300};
+# to be consistent with $KanaKanji $KanaKanji, changed
+# from 300 to 400.
+# See also TestRuleStatus in intltest/rbbiapts.cpp
+$KatakanaEx  $KatakanaEx {400};
  
  # rule 13a/b
  
  $ALetterEx      $ExtendNumLetEx {200};    #  (13a)
  $NumericEx      $ExtendNumLetEx {100};    #  (13a)
-$KatakanaEx     $ExtendNumLetEx {300};    #  (13a)
+$KatakanaEx     $ExtendNumLetEx {400};    #  (13a)
  $ExtendNumLetEx $ExtendNumLetEx {200};    #  (13a)
  
  $ExtendNumLetEx $ALetterEx  {200};    #  (13b)
  $ExtendNumLetEx $NumericEx  {100};    #  (13b)
-$ExtendNumLetEx $KatakanaEx {300};    #  (13b)
- 
+$ExtendNumLetEx $KatakanaEx {400};    #  (13b)
+
  # rule 13c
  
  $Regional_IndicatorEx $Regional_IndicatorEx;
  
+# special handling for CJK characters: chain for later dictionary segmentation
+$HangulSyllable $HangulSyllable {200};
+$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found 
  
  
  ## -------------------------------------------------
@@ -145,6 +159,7 @@ $BackNumericEx            = ($Format | $Extend)* $Numeric;
  $BackMidNumEx             = ($Format | $Extend)* $MidNum;
  $BackMidLetterEx          = ($Format | $Extend)* $MidLetter;
  $BackKatakanaEx           = ($Format | $Extend)* $Katakana;
+$BackHiraganaEx           = ($Format | $Extend)* $Hiragana;
  $BackExtendNumLetEx       = ($Format | $Extend)* $ExtendNumLet;
  $BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;
  
@@ -152,7 +167,7 @@ $BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;
  $LF $CR;
  
  # rule 4
-($Format | $Extend)*  [^$CR $LF $Newline]?;
+($Format | $Extend)*  [^$CR $LF $Newline $dictionaryCJK]?;
  
  # rule 5
  
@@ -192,6 +207,10 @@ $BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackEx
  
  $BackRegional_IndicatorEx $BackRegional_IndicatorEx;
  
+# special handling for CJK characters: chain for later dictionary segmentation
+$HangulSyllable $HangulSyllable;
+$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
+
  ## -------------------------------------------------
  
  !!safe_reverse;
author	Andy Heninger <andy.heninger@gmail.com>
	Fri, 28 Sep 2012 21:31:14 +0000 (21:31 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Fri, 28 Sep 2012 21:31:14 +0000 (21:31 +0000)
icu4c/source/data/brkitr/word.txt		patch \| blob \| history
icu4c/source/data/brkitr/word_POSIX.txt		patch \| blob \| history