granicus.if.org Git - icu/commitdiff
ICU-9601 from-UTF-8 m:n conversion: properly revert to pivoting for m:n matching
author Markus Scherer <markus.icu@gmail.com>
Fri, 5 Oct 2012 20:12:49 +0000 (20:12 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Fri, 5 Oct 2012 20:12:49 +0000 (20:12 +0000)
X-SVN-Rev: 32529

icu4c/source/common/ucnv.c
icu4c/source/common/ucnvmbcs.c
icu4c/source/test/testdata/Makefile.in
icu4c/source/test/testdata/conversion.txt
icu4c/source/test/testdata/test2.ucm [new file with mode: 0644]
icu4c/source/test/testdata/testdata.mak

index 3308c6c7bbdcc366e99cbb74de677bc019186224..8e5073c2e3b760cd42ce898d64d178b82238a1fd 100644 (file)
@@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 *
-*   Copyright (C) 1998-2011, International Business Machines
+*   Copyright (C) 1998-2012, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
@@ -2851,14 +2851,12 @@ ucnv_fromUCountPending(const UConverter* cnv, UErrorCode* status)
         return -1;
     }
 
-    if(cnv->preFromULength > 0){
+    if(cnv->preFromUFirstCP >= 0){
         return U16_LENGTH(cnv->preFromUFirstCP)+cnv->preFromULength ;
     }else if(cnv->preFromULength < 0){
         return -cnv->preFromULength ;
     }else if(cnv->fromUChar32 > 0){
         return 1;
-    }else if(cnv->preFromUFirstCP >0){
-        return U16_LENGTH(cnv->preFromUFirstCP);
     }
     return 0; 
 
index 159b5e5ca5abd25bbb5a70450dbe6573ab70653c..f3d83a3625e138e59a471c7454bff5e9ccd84943 100644 (file)
@@ -5122,6 +5122,7 @@ moreBytes:
                      * but then exit the loop because the extension match would
                      * have consumed the source.
                      */
+                    *pErrorCode=U_USING_DEFAULT_WARNING;
                     break;
                 } else {
                     /* a mapping was written to the target, continue */
@@ -5142,7 +5143,9 @@ moreBytes:
      * to stop before a truncated sequence.
      * If so, then collect the truncated sequence now.
      */
-    if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
+    if(U_SUCCESS(*pErrorCode) &&
+            cnv->preFromUFirstCP<0 &&
+            source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
         c=utf8->toUBytes[0]=b=*source++;
         toULength=1;
         toULimit=utf8_countTrailBytes[b]+1;
@@ -5428,6 +5431,7 @@ unassigned:
                      * but then exit the loop because the extension match would
                      * have consumed the source.
                      */
+                    *pErrorCode=U_USING_DEFAULT_WARNING;
                     break;
                 } else {
                     /* a mapping was written to the target, continue */
@@ -5449,7 +5453,9 @@ unassigned:
      * to stop before a truncated sequence.
      * If so, then collect the truncated sequence now.
      */
-    if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
+    if(U_SUCCESS(*pErrorCode) &&
+            cnv->preFromUFirstCP<0 &&
+            source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
         c=utf8->toUBytes[0]=b=*source++;
         toULength=1;
         toULimit=utf8_countTrailBytes[b]+1;
index 3baada9692d5aac3ffc47f8f653ef0984cb5d937..df5a82c0d7bcfabb63fb56e034eb832257665e53 100644 (file)
@@ -1,6 +1,6 @@
 #******************************************************************************
 #
-#   Copyright (C) 1998-2011, International Business Machines
+#   Copyright (C) 1998-2012, International Business Machines
 #   Corporation and others.  All Rights Reserved.
 #
 #******************************************************************************
@@ -129,7 +129,7 @@ TESTDT=$(TESTDATA)
 TEST_DAT_FILES=$(TESTBUILDDIR)/test.icu
 TEST_SPP_FILES=$(TESTBUILDDIR)/nfscsi.spp $(TESTBUILDDIR)/nfscss.spp $(TESTBUILDDIR)/nfscis.spp $(TESTBUILDDIR)/nfsmxs.spp $(TESTBUILDDIR)/nfsmxp.spp
 
-TEST_UCM_SOURCE= test1.ucm test1bmp.ucm test3.ucm test4.ucm test4x.ucm test5.ucm ibm9027.ucm
+TEST_UCM_SOURCE= test1.ucm test1bmp.ucm test2.ucm test3.ucm test4.ucm test4x.ucm test5.ucm ibm9027.ucm
 TEST_UCM_FILES=$(TEST_UCM_SOURCE:%=$(TESTSRCDATADIR)/data/%)
 TEST_CNV_FILES=$(TEST_UCM_SOURCE:%.ucm=$(TESTBUILDDIR)/%.cnv)
 
index f1c4cfed7350ce1fbf96f7482edf7f0dbfde0265..e9881a000f29b2b6d73b7e670e5f11152b44eb0d 100644 (file)
@@ -1808,6 +1808,21 @@ conversion:table(nofallback) {
           :intvector{ 0,0,0,0,0,1,2,2,2,3,3,3,3,3 },
           :int{1}, :int{0}, "", "0", ""
         }
+        // Bug #9601 direct-from-UTF-8 m:n Unicode:charset conversion.
+        {
+          "*test1bmp",
+          "uv",
+          :bin{       08 },
+          :intvector{ 0 },
+          :int{1}, :int{0}, "", "?", ""
+        }
+        {
+          "*test2",
+          "\U00101234\U00050005",
+          :bin{       0700010e05 },
+          :intvector{ 0,0,0,0,0 },
+          :int{1}, :int{0}, "", "?", ""
+        }
       }
     }
 
diff --git a/icu4c/source/test/testdata/test2.ucm b/icu4c/source/test/testdata/test2.ucm
new file mode 100644 (file)
index 0000000..3b950e1
--- /dev/null
@@ -0,0 +1,59 @@
+# *******************************************************************************
+# * Copyright (C) 2012, International Business Machines
+# * Corporation and others.  All Rights Reserved.
+# *******************************************************************************
+#
+# test2.ucm
+#
+# Test file for MBCS conversion with two-byte codepage data. (DBCS)
+# Also contains extension mappings (m:n).
+
+<code_set_name>     "test2"
+<mb_cur_max>        2
+<mb_cur_min>        1
+<uconv_class>       "MBCS"
+<subchar>           \x1A
+<icu:state>         0, 1:1, 5-9, 1a, ff
+<icu:state>         a-f.p
+
+CHARMAP
+
+# fromUnicode result is zero byte from other than U+0000
+<U0040>     \x00 |0
+
+# nothing special
+<U0065>     \x05 |0
+
+# extensions
+<U00c0>     \x05+\x01\x0d |0
+<U00c0>     \x05+\x01\x0e |3
+<U00c0>     \x05+\xff |3
+
+# toUnicode result is fallback direct
+<U0066>     \x06 |3
+
+# toUnicode result is direct non-BMP code point
+<U101234>   \x07 |0
+<Ufebcd>    \x08 |3
+
+# extensions
+<U101234>+<U50005>+<U60006> \x07+\x00+\x01\x0f+\x09 |0
+<U101234>+<U50005>          \x07+\x00+\x01\x0e+\x05 |0
+<U101234>+<U60006>          \x07+\x00+\x01\x0f+\x06 |0
+<U101234>+<U70007>          \x07+\x00+\x01\x0f |1
+
+#unassigned \x09
+
+# extensions where the first code point is unassigned, for replay testing
+#<U00c4><U0300> \x09+\x09 |0
+<U00c4><U00c4><U101234><U0005> \x05+\x01\x0c |0
+
+# toUnicode result is surrogate pair: test real pair, single unit, unassigned
+<U23456>    \x01\x0a |0
+<U000b>     \x01\x0b |0
+#unassigned \x01\x0c
+<U34567>    \x01\x0d |3
+<U000e>     \x01\x0e |3
+#unassigned \x01\x0f
+
+END CHARMAP
index 6d3786faeffc2f2f31b32bac57e71169a7b9503c..d38a19ff995715b245653ac43296774cea71a371 100644 (file)
@@ -1,5 +1,5 @@
 #**********************************************************************
-#* Copyright (C) 1999-2010, International Business Machines Corporation
+#* Copyright (C) 1999-2012, International Business Machines Corporation
 #* and others.  All Rights Reserved.
 #**********************************************************************
 #
@@ -28,7 +28,7 @@ ALL : "$(TESTDATAOUT)\testdata.dat"
 
 TEST_RES_FILES = $(TEST_RES_SOURCE:.txt=.res)
 
-"$(TESTDATAOUT)\testdata.dat" : $(TEST_RES_FILES) "$(TESTDATABLD)\casing.res" "$(TESTDATABLD)\conversion.res" "$(TESTDATABLD)\icuio.res" "$(TESTDATABLD)\mc.res" "$(TESTDATABLD)\structLocale.res" "$(TESTDATABLD)\root.res" "$(TESTDATABLD)\sh.res" "$(TESTDATABLD)\sh_YU.res"  "$(TESTDATABLD)\te.res" "$(TESTDATABLD)\te_IN.res" "$(TESTDATABLD)\te_IN_REVISED.res" "$(TESTDATABLD)\testaliases.res" "$(TESTDATABLD)\testtypes.res" "$(TESTDATABLD)\testempty.res" "$(TESTDATABLD)\iscii.res" "$(TESTDATABLD)\idna_rules.res" "$(TESTDATABLD)\DataDrivenCollationTest.res" "$(TESTDATABLD)\test.icu" "$(TESTDATABLD)\testtable32.res" "$(TESTDATABLD)\test1.cnv" "$(TESTDATABLD)\test1bmp.cnv" "$(TESTDATABLD)\test3.cnv" "$(TESTDATABLD)\test4.cnv" "$(TESTDATABLD)\test4x.cnv" "$(TESTDATABLD)\test5.cnv" "$(TESTDATABLD)\ibm9027.cnv" "$(TESTDATABLD)\nfscsi.spp" "$(TESTDATABLD)\nfscss.spp" "$(TESTDATABLD)\nfscis.spp" "$(TESTDATABLD)\nfsmxs.spp" "$(TESTDATABLD)\nfsmxp.spp" "$(TESTDATABLD)\testnorm.nrm"
+"$(TESTDATAOUT)\testdata.dat" : $(TEST_RES_FILES) "$(TESTDATABLD)\casing.res" "$(TESTDATABLD)\conversion.res" "$(TESTDATABLD)\icuio.res" "$(TESTDATABLD)\mc.res" "$(TESTDATABLD)\structLocale.res" "$(TESTDATABLD)\root.res" "$(TESTDATABLD)\sh.res" "$(TESTDATABLD)\sh_YU.res"  "$(TESTDATABLD)\te.res" "$(TESTDATABLD)\te_IN.res" "$(TESTDATABLD)\te_IN_REVISED.res" "$(TESTDATABLD)\testaliases.res" "$(TESTDATABLD)\testtypes.res" "$(TESTDATABLD)\testempty.res" "$(TESTDATABLD)\iscii.res" "$(TESTDATABLD)\idna_rules.res" "$(TESTDATABLD)\DataDrivenCollationTest.res" "$(TESTDATABLD)\test.icu" "$(TESTDATABLD)\testtable32.res" "$(TESTDATABLD)\test1.cnv" "$(TESTDATABLD)\test1bmp.cnv" "$(TESTDATABLD)\test2.cnv" "$(TESTDATABLD)\test3.cnv" "$(TESTDATABLD)\test4.cnv" "$(TESTDATABLD)\test4x.cnv" "$(TESTDATABLD)\test5.cnv" "$(TESTDATABLD)\ibm9027.cnv" "$(TESTDATABLD)\nfscsi.spp" "$(TESTDATABLD)\nfscss.spp" "$(TESTDATABLD)\nfscis.spp" "$(TESTDATABLD)\nfsmxs.spp" "$(TESTDATABLD)\nfsmxp.spp" "$(TESTDATABLD)\testnorm.nrm"
        @echo Building test data
        @copy "$(TESTDATABLD)\te.res" "$(TESTDATAOUT)\$(TESTDT)\nam.typ"
        @copy "$(TESTDATA)\old_l_testtypes.res" "$(TESTDATABLD)"
@@ -55,6 +55,7 @@ iscii.res
 test.icu
 test1.cnv
 test1bmp.cnv
+test2.cnv
 test3.cnv
 test4.cnv
 test4x.cnv
@@ -133,6 +134,10 @@ $(TEST_RES_FILES:.res =.res
        @echo Building $@
        @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**
 
+"$(TESTDATABLD)\test2.cnv": "$(TESTDATA)\test2.ucm"
+       @echo Building $@
+       @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**
+
 "$(TESTDATABLD)\test3.cnv": "$(TESTDATA)\test3.ucm"
        @echo Building $@
        @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**