granicus.if.org Git - python/commitdiff
Issue #12016: Multibyte CJK decoders now resynchronize faster
author Victor Stinner <victor.stinner@haypocalc.com>
Thu, 7 Jul 2011 23:45:13 +0000 (01:45 +0200)
committer Victor Stinner <victor.stinner@haypocalc.com>
Thu, 7 Jul 2011 23:45:13 +0000 (01:45 +0200)
They only ignore the first byte of an invalid byte sequence.

For example, b'\xff\n'.decode('gb2312', 'replace') gives '\ufffd\n' instead of
'\ufffd'.

13 files changed:
Doc/whatsnew/3.3.rst
Lib/test/test_codecencodings_cn.py
Lib/test/test_codecencodings_hk.py
Lib/test/test_codecencodings_jp.py
Lib/test/test_codecencodings_kr.py
Lib/test/test_codecencodings_tw.py
Lib/test/test_codecmaps_tw.py
Misc/NEWS
Modules/cjkcodecs/_codecs_cn.c
Modules/cjkcodecs/_codecs_hk.c
Modules/cjkcodecs/_codecs_jp.c
Modules/cjkcodecs/_codecs_kr.c
Modules/cjkcodecs/_codecs_tw.c

index e5e18051a511eaad26e0d34b396fb87beb329b03..990085e0f00a512131421dd54164527603b4c7eb 100644 (file)
@@ -68,6 +68,29 @@ New, Improved, and Deprecated Modules
 
 * Stub
 
+codecs
+------
+
+Multibyte CJK decoders now resynchronize faster. They only ignore the first
+byte of an invalid byte sequence. For example, b'\xff\n'.decode('gb2312',
+'replace') gives '\ufffd\n' instead of '\ufffd'.
+
+(http://bugs.python.org/issue12016)
+
+Don't reset incremental encoders of CJK codecs at each call to their encode()
+method anymore. For example: ::
+
+    $ ./python -q
+    >>> import codecs
+    >>> encoder = codecs.getincrementalencoder('hz')('strict')
+    >>> b''.join(encoder.encode(x) for x in '\u52ff\u65bd\u65bc\u4eba\u3002 Bye.')
+    b'~{NpJ)l6HK!#~} Bye.'
+
+This example gives b'~{Np~}~{J)~}~{l6~}~{HK~}~{!#~} Bye.' with older Python
+versions.
+
+(http://bugs.python.org/issue12100)
+
 faulthandler
 ------------
 
index dca9f10b8365daa245059d65afc5c5db8660414e..ee3d1650cb51646a077a0f8591f55279d4f64c34 100644 (file)
@@ -15,8 +15,8 @@ class Test_GB2312(test_multibytecodec_support.TestBase, unittest.TestCase):
         # invalid bytes
         (b"abc\x81\x81\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x81\x81\xc1\xc4", "replace", "abc\ufffd\u804a"),
-        (b"abc\x81\x81\xc1\xc4\xc8", "replace", "abc\ufffd\u804a\ufffd"),
+        (b"abc\x81\x81\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),
+        (b"abc\x81\x81\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),
         (b"abc\x81\x81\xc1\xc4", "ignore",  "abc\u804a"),
         (b"\xc1\x64", "strict", None),
     )
@@ -28,8 +28,8 @@ class Test_GBK(test_multibytecodec_support.TestBase, unittest.TestCase):
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u804a"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u804a\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u804a"),
         (b"\x83\x34\x83\x31", "strict", None),
         ("\u30fb", "strict", None),
@@ -42,11 +42,14 @@ class Test_GB18030(test_multibytecodec_support.TestBase, unittest.TestCase):
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u804a"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u804a\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u804a"),
-        (b"abc\x84\x39\x84\x39\xc1\xc4", "replace", "abc\ufffd\u804a"),
+        (b"abc\x84\x39\x84\x39\xc1\xc4", "replace", "abc\ufffd9\ufffd9\u804a"),
         ("\u30fb", "strict", b"\x819\xa79"),
+        (b"abc\x84\x32\x80\x80def", "replace", 'abc\ufffd2\ufffd\ufffddef'),
+        (b"abc\x81\x30\x81\x30def", "strict", 'abc\x80def'),
+        (b"abc\x86\x30\x81\x30def", "replace", 'abc\ufffd0\ufffd0def'),
     )
     has_iso10646 = True
 
@@ -74,9 +77,11 @@ class Test_HZ(test_multibytecodec_support.TestBase, unittest.TestCase):
          '\u5df1\u6240\u4e0d\u6b32\uff0c\u52ff\u65bd\u65bc\u4eba\u3002'
          'Bye.\n'),
         # invalid bytes
-        (b'ab~cd', 'replace', 'ab\uFFFDd'),
+        (b'ab~cd', 'replace', 'ab\uFFFDcd'),
         (b'ab\xffcd', 'replace', 'ab\uFFFDcd'),
         (b'ab~{\x81\x81\x41\x44~}cd', 'replace', 'ab\uFFFD\uFFFD\u804Acd'),
+        (b'ab~{\x41\x44~}cd', 'replace', 'ab\u804Acd'),
+        (b"ab~{\x79\x79\x41\x44~}cd", "replace", "ab\ufffd\ufffd\u804acd"),
     )
 
 def test_main():
index ccdc0b4c554460ee3ea7ace67606b379ce74f1b3..520df4349464858cb3bf2eed7330e1a9203f0cb9 100644 (file)
@@ -15,8 +15,8 @@ class Test_Big5HKSCS(test_multibytecodec_support.TestBase, unittest.TestCase):
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u8b10"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u8b10\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u8b10"),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u8b10\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u8b10"),
     )
 
index f56a3738961a6c8701f43d0773ecce7564ce75d6..87e4812482d25702e7eeecd5e2a4033e66c4e05c 100644 (file)
@@ -15,50 +15,57 @@ class Test_CP932(test_multibytecodec_support.TestBase, unittest.TestCase):
         # invalid bytes
         (b"abc\x81\x00\x81\x00\x82\x84", "strict",  None),
         (b"abc\xf8", "strict",  None),
-        (b"abc\x81\x00\x82\x84", "replace", "abc\ufffd\uff44"),
-        (b"abc\x81\x00\x82\x84\x88", "replace", "abc\ufffd\uff44\ufffd"),
-        (b"abc\x81\x00\x82\x84", "ignore",  "abc\uff44"),
+        (b"abc\x81\x00\x82\x84", "replace", "abc\ufffd\x00\uff44"),
+        (b"abc\x81\x00\x82\x84\x88", "replace", "abc\ufffd\x00\uff44\ufffd"),
+        (b"abc\x81\x00\x82\x84", "ignore",  "abc\x00\uff44"),
+        (b"ab\xEBxy", "replace", "ab\uFFFDxy"),
+        (b"ab\xF0\x39xy", "replace", "ab\uFFFD9xy"),
+        (b"ab\xEA\xF0xy", "replace", 'ab\ufffd\ue038y'),
         # sjis vs cp932
         (b"\\\x7e", "replace", "\\\x7e"),
         (b"\x81\x5f\x81\x61\x81\x7c", "replace", "\uff3c\u2225\uff0d"),
     )
 
+euc_commontests = (
+    # invalid bytes
+    (b"abc\x80\x80\xc1\xc4", "strict",  None),
+    (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u7956"),
+    (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u7956\ufffd"),
+    (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u7956"),
+    (b"abc\xc8", "strict",  None),
+    (b"abc\x8f\x83\x83", "replace", "abc\ufffd\ufffd\ufffd"),
+    (b"\x82\xFCxy", "replace", "\ufffd\ufffdxy"),
+    (b"\xc1\x64", "strict", None),
+    (b"\xa1\xc0", "strict", "\uff3c"),
+    (b"\xa1\xc0\\", "strict", "\uff3c\\"),
+    (b"\x8eXY", "replace", "\ufffdXY"),
+)
+
+class Test_EUC_JIS_2004(test_multibytecodec_support.TestBase,
+                        unittest.TestCase):
+    encoding = 'euc_jis_2004'
+    tstring = test_multibytecodec_support.load_teststring('euc_jisx0213')
+    codectests = euc_commontests
+    xmlcharnametest = (
+        "\xab\u211c\xbb = \u2329\u1234\u232a",
+        b"\xa9\xa8&real;\xa9\xb2 = &lang;&#4660;&rang;"
+    )
+
 class Test_EUC_JISX0213(test_multibytecodec_support.TestBase,
                         unittest.TestCase):
     encoding = 'euc_jisx0213'
     tstring = test_multibytecodec_support.load_teststring('euc_jisx0213')
-    codectests = (
-        # invalid bytes
-        (b"abc\x80\x80\xc1\xc4", "strict",  None),
-        (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u7956"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u7956\ufffd"),
-        (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u7956"),
-        (b"abc\x8f\x83\x83", "replace", "abc\ufffd"),
-        (b"\xc1\x64", "strict", None),
-        (b"\xa1\xc0", "strict", "\uff3c"),
-    )
+    codectests = euc_commontests
     xmlcharnametest = (
         "\xab\u211c\xbb = \u2329\u1234\u232a",
         b"\xa9\xa8&real;\xa9\xb2 = &lang;&#4660;&rang;"
     )
 
-eucjp_commontests = (
-    (b"abc\x80\x80\xc1\xc4", "strict",  None),
-    (b"abc\xc8", "strict",  None),
-    (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u7956"),
-    (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u7956\ufffd"),
-    (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u7956"),
-    (b"abc\x8f\x83\x83", "replace", "abc\ufffd"),
-    (b"\xc1\x64", "strict", None),
-)
-
 class Test_EUC_JP_COMPAT(test_multibytecodec_support.TestBase,
                          unittest.TestCase):
     encoding = 'euc_jp'
     tstring = test_multibytecodec_support.load_teststring('euc_jp')
-    codectests = eucjp_commontests + (
-        (b"\xa1\xc0\\", "strict", "\uff3c\\"),
+    codectests = euc_commontests + (
         ("\xa5", "strict", b"\x5c"),
         ("\u203e", "strict", b"\x7e"),
     )
@@ -66,8 +73,6 @@ class Test_EUC_JP_COMPAT(test_multibytecodec_support.TestBase,
 shiftjis_commonenctests = (
     (b"abc\x80\x80\x82\x84", "strict",  None),
     (b"abc\xf8", "strict",  None),
-    (b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\uff44"),
-    (b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\uff44\ufffd"),
     (b"abc\x80\x80\x82\x84def", "ignore",  "abc\uff44def"),
 )
 
@@ -75,20 +80,41 @@ class Test_SJIS_COMPAT(test_multibytecodec_support.TestBase, unittest.TestCase):
     encoding = 'shift_jis'
     tstring = test_multibytecodec_support.load_teststring('shift_jis')
     codectests = shiftjis_commonenctests + (
+        (b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\ufffd\uff44"),
+        (b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\ufffd\uff44\ufffd"),
+
         (b"\\\x7e", "strict", "\\\x7e"),
         (b"\x81\x5f\x81\x61\x81\x7c", "strict", "\uff3c\u2016\u2212"),
+        (b"abc\x81\x39", "replace",  "abc\ufffd9"),
+        (b"abc\xEA\xFC", "replace",  "abc\ufffd\ufffd"),
+        (b"abc\xFF\x58", "replace",  "abc\ufffdX"),
+    )
+
+class Test_SJIS_2004(test_multibytecodec_support.TestBase, unittest.TestCase):
+    encoding = 'shift_jis_2004'
+    tstring = test_multibytecodec_support.load_teststring('shift_jis')
+    codectests = shiftjis_commonenctests + (
+        (b"\\\x7e", "strict", "\xa5\u203e"),
+        (b"\x81\x5f\x81\x61\x81\x7c", "strict", "\\\u2016\u2212"),
+        (b"abc\xEA\xFC", "strict",  "abc\u64bf"),
+        (b"\x81\x39xy", "replace",  "\ufffd9xy"),
+        (b"\xFF\x58xy", "replace",  "\ufffdXxy"),
+        (b"\x80\x80\x82\x84xy", "replace", "\ufffd\ufffd\uff44xy"),
+        (b"\x80\x80\x82\x84\x88xy", "replace", "\ufffd\ufffd\uff44\u5864y"),
+        (b"\xFC\xFBxy", "replace", '\ufffd\u95b4y'),
+    )
+    xmlcharnametest = (
+        "\xab\u211c\xbb = \u2329\u1234\u232a",
+        b"\x85G&real;\x85Q = &lang;&#4660;&rang;"
     )
 
 class Test_SJISX0213(test_multibytecodec_support.TestBase, unittest.TestCase):
     encoding = 'shift_jisx0213'
     tstring = test_multibytecodec_support.load_teststring('shift_jisx0213')
-    codectests = (
-        # invalid bytes
-        (b"abc\x80\x80\x82\x84", "strict",  None),
-        (b"abc\xf8", "strict",  None),
-        (b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\uff44"),
-        (b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\uff44\ufffd"),
-        (b"abc\x80\x80\x82\x84def", "ignore",  "abc\uff44def"),
+    codectests = shiftjis_commonenctests + (
+        (b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\ufffd\uff44"),
+        (b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\ufffd\uff44\ufffd"),
+
         # sjis vs cp932
         (b"\\\x7e", "replace", "\xa5\u203e"),
         (b"\x81\x5f\x81\x61\x81\x7c", "replace", "\x5c\u2016\u2212"),
index de4da7f5b6df8a3a587c927e7984085198abfee3..4997e8349b0b313a6ae2bfe26f277c2436d4f99f 100644 (file)
@@ -15,8 +15,8 @@ class Test_CP949(test_multibytecodec_support.TestBase, unittest.TestCase):
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\uc894"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\uc894\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\uc894"),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\uc894\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\uc894"),
     )
 
@@ -27,8 +27,8 @@ class Test_EUCKR(test_multibytecodec_support.TestBase, unittest.TestCase):
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\uc894"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\uc894\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", 'abc\ufffd\ufffd\uc894'),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\uc894\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\uc894"),
 
         # composed make-up sequence errors
@@ -40,13 +40,14 @@ class Test_EUCKR(test_multibytecodec_support.TestBase, unittest.TestCase):
         (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4", "strict", None),
         (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4", "strict", "\uc4d4"),
         (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4x", "strict", "\uc4d4x"),
-        (b"a\xa4\xd4\xa4\xb6\xa4", "replace", "a\ufffd"),
+        (b"a\xa4\xd4\xa4\xb6\xa4", "replace", 'a\ufffd'),
         (b"\xa4\xd4\xa3\xb6\xa4\xd0\xa4\xd4", "strict", None),
         (b"\xa4\xd4\xa4\xb6\xa3\xd0\xa4\xd4", "strict", None),
         (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa3\xd4", "strict", None),
-        (b"\xa4\xd4\xa4\xff\xa4\xd0\xa4\xd4", "replace", "\ufffd"),
-        (b"\xa4\xd4\xa4\xb6\xa4\xff\xa4\xd4", "replace", "\ufffd"),
-        (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xff", "replace", "\ufffd"),
+        (b"\xa4\xd4\xa4\xff\xa4\xd0\xa4\xd4", "replace", '\ufffd\u6e21\ufffd\u3160\ufffd'),
+        (b"\xa4\xd4\xa4\xb6\xa4\xff\xa4\xd4", "replace", '\ufffd\u6e21\ub544\ufffd\ufffd'),
+        (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xff", "replace", '\ufffd\u6e21\ub544\u572d\ufffd'),
+        (b"\xa4\xd4\xff\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4", "replace", '\ufffd\ufffd\ufffd\uc4d4'),
         (b"\xc1\xc4", "strict", "\uc894"),
     )
 
@@ -57,9 +58,13 @@ class Test_JOHAB(test_multibytecodec_support.TestBase, unittest.TestCase):
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ucd27"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ucd27\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\ucd27"),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\ucd27\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\ucd27"),
+        (b"\xD8abc", "replace",  "\uFFFDabc"),
+        (b"\xD8\xFFabc", "replace",  "\uFFFD\uFFFDabc"),
+        (b"\x84bxy", "replace",  "\uFFFDbxy"),
+        (b"\x8CBxy", "replace",  "\uFFFDBxy"),
     )
 
 def test_main():
index 12d3c9fa041e963a10d42ab23b28156747cbbd48..f2f3c1802d516f4c07fe7e8740a9d658e825c024 100644 (file)
@@ -15,8 +15,8 @@ class Test_Big5(test_multibytecodec_support.TestBase, unittest.TestCase):
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u8b10"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u8b10\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u8b10"),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u8b10\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u8b10"),
     )
 
index 6db5091fc3ae42893226dbce442b0d99e7cc9a94..412b9de8b25a059162d97cc09897636cd2784d39 100644 (file)
@@ -23,6 +23,9 @@ class TestCP950Map(test_multibytecodec_support.TestBase_Mapping,
         (b'\xa2\xcc', '\u5341'),
         (b'\xa2\xce', '\u5345'),
     ]
+    codectests = (
+        (b"\xFFxy", "replace",  "\ufffdxy"),
+    )
 
 def test_main():
     support.run_unittest(__name__)
index f742f8afa2e5ec6c6535628eb87904bcb41f39a1..404185d73063707e8229ad5c2a9a03e7a3c0f4f1 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -219,6 +219,10 @@ Core and Builtins
 Library
 -------
 
+- Issue #12016: Multibyte CJK decoders now resynchronize faster. They only
+  ignore the first byte of an invalid byte sequence. For example,
+  b'\xff\n'.decode('gb2312', 'replace') gives '\ufffd\n' instead of '\ufffd'.
+
 - Issue #12459: time.sleep() now raises a ValueError if the sleep length is
   negative, instead of an infinite sleep on Windows or raising an IOError on
   Linux for example, to have the same behaviour on all platforms.
index ab4e6593322e9992d34566d39a63553bb6bd9754..9e9e96c4d1c16345108f01e709208c112a929850 100644 (file)
@@ -85,7 +85,7 @@ DECODER(gb2312)
         TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
             NEXT(2, 1)
         }
-        else return 2;
+        else return 1;
     }
 
     return 0;
@@ -141,7 +141,7 @@ DECODER(gbk)
         REQUIRE_INBUF(2)
 
         GBK_DECODE(c, IN2, **outbuf)
-        else return 2;
+        else return 1;
 
         NEXT(2, 1)
     }
@@ -267,7 +267,7 @@ DECODER(gb18030)
             c3 = IN3;
             c4 = IN4;
             if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
-                return 4;
+                return 1;
             c -= 0x81;  c2 -= 0x30;
             c3 -= 0x81; c4 -= 0x30;
 
@@ -292,12 +292,12 @@ DECODER(gb18030)
                     continue;
                 }
             }
-            return 4;
+            return 1;
         }
 
         GBK_DECODE(c, c2, **outbuf)
         else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
-        else return 2;
+        else return 1;
 
         NEXT(2, 1)
     }
@@ -400,7 +400,7 @@ DECODER(hz)
             else if (c2 == '\n')
                 ; /* line-continuation */
             else
-                return 2;
+                return 1;
             NEXT(2, 0);
             continue;
         }
@@ -419,7 +419,7 @@ DECODER(hz)
                 NEXT(2, 1)
             }
             else
-                return 2;
+                return 1;
         }
     }
 
index 558a42f89c8ceb795a08e4659f2d5294c48ce5fb..d3ad04b6dd44507643c55927a5c0e8b143024844 100644 (file)
@@ -161,7 +161,7 @@ DECODER(big5hkscs)
         case 0x8864: WRITE2(0x00ca, 0x030c); break;
         case 0x88a3: WRITE2(0x00ea, 0x0304); break;
         case 0x88a5: WRITE2(0x00ea, 0x030c); break;
-        default: return 2;
+        default: return 1;
         }
 
         NEXT(2, 2) /* all decoded codepoints are pairs, above. */
index a05e01b32e5a41281ea1e87379d4e406f82fd588..a500696e9312f201437c42ed03a0ba0d1c2c5efd 100644 (file)
@@ -112,7 +112,7 @@ DECODER(cp932)
         TRYMAP_DEC(cp932ext, **outbuf, c, c2);
         else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){
             if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
-                return 2;
+                return 1;
 
             c = (c < 0xe0 ? c - 0x81 : c - 0xc1);
             c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
@@ -120,7 +120,7 @@ DECODER(cp932)
             c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;
 
             TRYMAP_DEC(jisx0208, **outbuf, c, c2);
-            else return 2;
+            else return 1;
         }
         else if (c >= 0xf0 && c <= 0xf9) {
             if ((c2 >= 0x40 && c2 <= 0x7e) ||
@@ -128,10 +128,10 @@ DECODER(cp932)
                 OUT1(0xe000 + 188 * (c - 0xf0) +
                      (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41))
             else
-                return 2;
+                return 1;
         }
         else
-            return 2;
+            return 1;
 
         NEXT(2, 1)
     }
@@ -256,7 +256,7 @@ DECODER(euc_jis_2004)
                 NEXT(2, 1)
             }
             else
-                return 2;
+                return 1;
         }
         else if (c == 0x8f) {
             unsigned char c2, c3;
@@ -274,7 +274,7 @@ DECODER(euc_jis_2004)
                 continue;
             }
             else TRYMAP_DEC(jisx0212, **outbuf, c2, c3) ;
-            else return 3;
+            else return 1;
             NEXT(3, 1)
         }
         else {
@@ -300,7 +300,7 @@ DECODER(euc_jis_2004)
                 NEXT(2, 2)
                 continue;
             }
-            else return 2;
+            else return 1;
             NEXT(2, 1)
         }
     }
@@ -388,7 +388,7 @@ DECODER(euc_jp)
                 NEXT(2, 1)
             }
             else
-                return 2;
+                return 1;
         }
         else if (c == 0x8f) {
             unsigned char c2, c3;
@@ -401,7 +401,7 @@ DECODER(euc_jp)
                 NEXT(3, 1)
             }
             else
-                return 3;
+                return 1;
         }
         else {
             unsigned char c2;
@@ -417,7 +417,7 @@ DECODER(euc_jp)
 #endif
                 TRYMAP_DEC(jisx0208, **outbuf,
                            c ^ 0x80, c2 ^ 0x80) ;
-            else return 2;
+            else return 1;
             NEXT(2, 1)
         }
     }
@@ -502,7 +502,7 @@ DECODER(shift_jis)
             REQUIRE_INBUF(2)
             c2 = IN2;
             if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
-                return 2;
+                return 1;
 
             c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
             c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
@@ -522,10 +522,10 @@ DECODER(shift_jis)
                 continue;
             }
             else
-                return 2;
+                return 1;
         }
         else
-            return 2;
+            return 1;
 
         NEXT(1, 1) /* JIS X 0201 */
     }
@@ -645,7 +645,7 @@ DECODER(shift_jis_2004)
             REQUIRE_INBUF(2)
             c2 = IN2;
             if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
-                return 2;
+                return 1;
 
             c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
             c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
@@ -671,7 +671,7 @@ DECODER(shift_jis_2004)
                     NEXT_OUT(2)
                 }
                 else
-                    return 2;
+                    return 1;
                 NEXT_IN(2)
             }
             else { /* Plane 2 */
@@ -689,13 +689,13 @@ DECODER(shift_jis_2004)
                     continue;
                 }
                 else
-                    return 2;
+                    return 1;
                 NEXT(2, 1)
             }
             continue;
         }
         else
-            return 2;
+            return 1;
 
         NEXT(1, 1) /* JIS X 0201 */
     }
index 9272e363e1f71a15ddcfd6eba6e94895475602e0..f5697dd2f698050dec0be9db56010bb7f7694818 100644 (file)
@@ -123,7 +123,7 @@ DECODER(euc_kr)
             if ((*inbuf)[2] != EUCKR_JAMO_FIRSTBYTE ||
                 (*inbuf)[4] != EUCKR_JAMO_FIRSTBYTE ||
                 (*inbuf)[6] != EUCKR_JAMO_FIRSTBYTE)
-                return 8;
+                return 1;
 
             c = (*inbuf)[3];
             if (0xa1 <= c && c <= 0xbe)
@@ -143,7 +143,7 @@ DECODER(euc_kr)
                 jong = NONE;
 
             if (cho == NONE || jung == NONE || jong == NONE)
-                return 8;
+                return 1;
 
             OUT1(0xac00 + cho*588 + jung*28 + jong);
             NEXT(8, 1)
@@ -152,7 +152,7 @@ DECODER(euc_kr)
             NEXT(2, 1)
         }
         else
-            return 2;
+            return 1;
     }
 
     return 0;
@@ -208,7 +208,7 @@ DECODER(cp949)
         REQUIRE_INBUF(2)
         TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80);
         else TRYMAP_DEC(cp949ext, **outbuf, c, IN2);
-        else return 2;
+        else return 1;
 
         NEXT(2, 1)
     }
@@ -375,7 +375,7 @@ DECODER(johab)
             i_jong = johabidx_jongseong[c_jong];
 
             if (i_cho == NONE || i_jung == NONE || i_jong == NONE)
-                return 2;
+                return 1;
 
             /* we don't use U+1100 hangul jamo yet. */
             if (i_cho == FILL) {
@@ -391,7 +391,7 @@ DECODER(johab)
                         OUT1(0x3100 |
                           johabjamo_jungseong[c_jung])
                     else
-                        return 2;
+                        return 1;
                 }
             } else {
                 if (i_jung == FILL) {
@@ -399,7 +399,7 @@ DECODER(johab)
                         OUT1(0x3100 |
                           johabjamo_choseong[c_cho])
                     else
-                        return 2;
+                        return 1;
                 }
                 else
                     OUT1(0xac00 +
@@ -414,7 +414,7 @@ DECODER(johab)
                 c2 < 0x31 || (c2 >= 0x80 && c2 < 0x91) ||
                 (c2 & 0x7f) == 0x7f ||
                 (c == 0xda && (c2 >= 0xa1 && c2 <= 0xd3)))
-                return 2;
+                return 1;
             else {
                 unsigned char t1, t2;
 
@@ -425,7 +425,7 @@ DECODER(johab)
                 t2 = (t2 < 0x5e ? t2 : t2 - 0x5e) + 0x21;
 
                 TRYMAP_DEC(ksx1001, **outbuf, t1, t2);
-                else return 2;
+                else return 1;
                 NEXT(2, 1)
             }
         }
index 38cf7239b4b81abfbd9916cf0d32628ef6762dce..916298d167400dc26709497c9cd69b9a34679ae1 100644 (file)
@@ -55,7 +55,7 @@ DECODER(big5)
         TRYMAP_DEC(big5, **outbuf, c, IN2) {
             NEXT(2, 1)
         }
-        else return 2;
+        else return 1;
     }
 
     return 0;
@@ -109,7 +109,7 @@ DECODER(cp950)
 
         TRYMAP_DEC(cp950ext, **outbuf, c, IN2);
         else TRYMAP_DEC(big5, **outbuf, c, IN2);
-        else return 2;
+        else return 1;
 
         NEXT(2, 1)
     }