granicus.if.org Git - icu/commitdiff
ICU-12079 Full width digits become numeric for word break.
author Andy Heninger <andy.heninger@gmail.com>
Tue, 4 Dec 2018 01:34:38 +0000 (17:34 -0800)
committer Andy Heninger <andy.heninger@gmail.com>
Wed, 19 Dec 2018 00:36:06 +0000 (16:36 -0800)
icu4c/source/data/brkitr/rules/word.txt
icu4c/source/data/brkitr/rules/word_POSIX.txt
icu4c/source/test/intltest/rbbitst.cpp
icu4c/source/test/testdata/break_rules/word.txt
icu4c/source/test/testdata/break_rules/word_POSIX.txt
icu4c/source/test/testdata/rbbitst.txt
icu4j/main/shared/data/icudata.jar
icu4j/main/shared/data/icutzdata.jar
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/word.txt
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/word_POSIX.txt

index 3066922be1da96a7e48c90d3728745feca7d826a..3027574d25d5602ea73e05001e2106055e7e8b76 100644 (file)
@@ -44,7 +44,7 @@ $Double_Quote       = [\p{Word_Break = Double_Quote}];
 $MidNumLet          = [\p{Word_Break = MidNumLet}];
 $MidLetter          = [\p{Word_Break = MidLetter}];
 $MidNum             = [\p{Word_Break = MidNum}];
-$Numeric            = [\p{Word_Break = Numeric}];
+$Numeric            = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]];  # Patch for ICU-12079
 $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
 $WSegSpace          = [\p{Word_Break = WSegSpace}];
 $Extended_Pict      = [:ExtPict:];
index 79126931ead2fbc5ef31d60376c67ed13514fa44..bcf127a42aa7af7ded502e69f0d6127215305377 100644 (file)
@@ -44,7 +44,7 @@ $Double_Quote       = [\p{Word_Break = Double_Quote}];
 $MidNumLet          = [\p{Word_Break = MidNumLet} - [.]];
 $MidLetter          = [\p{Word_Break = MidLetter} - [\:]];
 $MidNum             = [\p{Word_Break = MidNum} [.]];
-$Numeric            = [\p{Word_Break = Numeric}];
+$Numeric            = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]];  # Patch for ICU-12079
 $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
 $WSegSpace          = [\p{Word_Break = WSegSpace}];
 $Extended_Pict      = [:ExtPict:];
index 5467a96b49950cc432e2f12846134eb11ddb05d7..3d1c0a8340df9100ca89b2ec27164d4c73aa0c0c 100644 (file)
@@ -1872,7 +1872,7 @@ RBBIWordMonkey::RBBIWordMonkey()
     fMidNumLetSet     = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]",    status);
     fMidLetterSet     = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]",    status);
     fMidNumSet        = new UnicodeSet(u"[\\p{Word_Break = MidNum}]",       status);
-    fNumericSet       = new UnicodeSet(u"[\\p{Word_Break = Numeric}]",      status);
+    fNumericSet       = new UnicodeSet(u"[[\\p{Word_Break = Numeric}][\\uff10-\\uff19]]", status);
     fFormatSet        = new UnicodeSet(u"[\\p{Word_Break = Format}]",       status);
     fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
     fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}]",       status);
index fc7bc9b188655f04fbc0d96f0c8655c440ef3642..9b3e527ee72c41c93e1c9946bbbfd0172d7894dd 100644 (file)
@@ -30,7 +30,7 @@ Double_Quote       = [\p{Word_Break = Double_Quote}];
 MidNumLet          = [\p{Word_Break = MidNumLet}];
 MidLetter          = [\p{Word_Break = MidLetter}];
 MidNum             = [\p{Word_Break = MidNum}];
-Numeric            = [\p{Word_Break = Numeric}];
+Numeric            = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]];  # Patch for ICU-12079;
 ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
 WSegSpace          = [\p{Word_Break = WSegSpace}];
 Extended_Pict      = [:ExtPict:];
index 10efc32d2103d445d690ec201a36eb22cb3d2cf7..04bcb321ae93cee894c4fadb26ae54ee9f0d0e41 100644 (file)
@@ -29,7 +29,7 @@ Double_Quote       = [\p{Word_Break = Double_Quote}];
 MidNumLet          = [\p{Word_Break = MidNumLet} - [.]];
 MidLetter          = [\p{Word_Break = MidLetter} - [\:]];
 MidNum             = [\p{Word_Break = MidNum} [.]];
-Numeric            = [\p{Word_Break = Numeric}];
+Numeric            = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]];  # Patch for ICU-12079;
 ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
 WSegSpace          = [\p{Word_Break = WSegSpace}];
 Extended_Pict      = [:ExtPict:];
index 63ba172233d857844b2a2c833369670d907466f8..e9f2a32099a334b1022cc8543969bc5bf32c4c66 100644 (file)
 <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>
 
 # fullwidth numeric, midletter characters etc should be treated like their halfwidth counterparts
-# <data>•ISN'T<200> •19<100>日<400></data>
-# why was this added with the dbbi stuff?
+<data>•ISN'T<200> •19<100>日<400></data>
 
 #      to test for bug #4098467
 #      What follows is a string of Korean characters (I found it in the Yellow Pages
index f34485f69f0908fa746d314475f8797595da694c..1e5b5cb60676fdd1b26ef80eb96edc5b11b08805 100644 (file)
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:70f6389c315809aac32ac3145b5fe813e59de7b8ac13f4a8ead60f58c443cca5
-size 12697817
+oid sha256:1b8bb0208f9fd791029d55f17dd9722d7b4062f5478e55c28722cc7188435507
+size 12690372
index 1f0a57452ecef249ac99010c0305978f16901d83..829821222ec797a858c8135f62cff7b1690792ce 100644 (file)
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f810eeeeb46325f0b314b2ea0d971ee7c3c850e6160660651aff0be8aaaa27f7
+oid sha256:99a35b2f985a8a281b8474cc074195d23bdc6757ea561d082b5fb94a2d749cb2
 size 92787
index 7b5803264c8d991d6496e1c907c55edca913f8c9..9a690f9375da06a3e24d4cd2a589ce18d7baecb3 100644 (file)
@@ -330,7 +330,7 @@ public class RBBITestMonkey extends TestFmwk {
             fMidNumLetSet    = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
             fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
             fMidNumSet       = new UnicodeSet("[\\p{Word_Break = MidNum}]");
-            fNumericSet      = new UnicodeSet("[\\p{Word_Break = Numeric}]");
+            fNumericSet      = new UnicodeSet("[[\\p{Word_Break = Numeric}][\\uFF10-\\uff19]]");
             fFormatSet       = new UnicodeSet("[\\p{Word_Break = Format}]");
             fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
             fExtendSet       = new UnicodeSet("[\\p{Word_Break = Extend}]");
index fc7bc9b188655f04fbc0d96f0c8655c440ef3642..9b3e527ee72c41c93e1c9946bbbfd0172d7894dd 100644 (file)
@@ -30,7 +30,7 @@ Double_Quote       = [\p{Word_Break = Double_Quote}];
 MidNumLet          = [\p{Word_Break = MidNumLet}];
 MidLetter          = [\p{Word_Break = MidLetter}];
 MidNum             = [\p{Word_Break = MidNum}];
-Numeric            = [\p{Word_Break = Numeric}];
+Numeric            = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]];  # Patch for ICU-12079;
 ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
 WSegSpace          = [\p{Word_Break = WSegSpace}];
 Extended_Pict      = [:ExtPict:];
index 10efc32d2103d445d690ec201a36eb22cb3d2cf7..04bcb321ae93cee894c4fadb26ae54ee9f0d0e41 100644 (file)
@@ -29,7 +29,7 @@ Double_Quote       = [\p{Word_Break = Double_Quote}];
 MidNumLet          = [\p{Word_Break = MidNumLet} - [.]];
 MidLetter          = [\p{Word_Break = MidLetter} - [\:]];
 MidNum             = [\p{Word_Break = MidNum} [.]];
-Numeric            = [\p{Word_Break = Numeric}];
+Numeric            = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]];  # Patch for ICU-12079;
 ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
 WSegSpace          = [\p{Word_Break = WSegSpace}];
 Extended_Pict      = [:ExtPict:];