Issue #16979: Fix error handling bugs in the unicode-escape-decode decoder.

author Serhiy Storchaka <storchaka@gmail.com>
Tue, 29 Jan 2013 08:28:07 +0000 (10:28 +0200)

committer Serhiy Storchaka <storchaka@gmail.com>
Tue, 29 Jan 2013 08:28:07 +0000 (10:28 +0200)
diff --cc Lib/test/test_codeccallbacks.py
Simple merge
diff --cc Lib/test/test_codecs.py

index 67690b855f585e74b61e2f57fece9625efa8203f,e74038be4d85d47c1601b326c0b8db45e3dcd2a0..4c0c6debfaa2d579dd49f257d76b61512391947f
--- 1/Lib/test/test_codecs.py
--- 2/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@@ -1,26 -1,14 +1,31 @@@
- -from test import support
- -import unittest
+ +import _testcapi
   import codecs
+ +import io
   import locale
- -import sys, _testcapi, io
+ +import sys
+ +import unittest
+ +import warnings
+ +
+ +from test import support
+ +
+ +if sys.platform == 'win32':
+ +    VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)
+ +else:
+ +    VISTA_OR_LATER = False
+ +
+ +try:
+ +    import ctypes
+ +except ImportError:
+ +    ctypes = None
+ +    SIZEOF_WCHAR_T = -1
+ +else:
+ +    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
   
+ def coding_checker(self, coder):
+     def check(input, expect):
+         self.assertEqual(coder(input), (expect, len(input)))
+     return check
+ 
   class Queue(object):
       """
       queue: write bytes at one end, read bytes from the other end
@@@ -2003,12 -1851,85 +2008,91 @@@ class TypesTest(unittest.TestCase)
           self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
           self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
   
+ +        self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
+ +        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
+ +
+ +        self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000")
+ +        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
+ +
+ 
+ class UnicodeEscapeTest(unittest.TestCase):
+     def test_empty(self):
+         self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
+         self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
+ 
+     def test_raw_encode(self):
+         encode = codecs.unicode_escape_encode
+         for b in range(32, 127):
+             if b != b'\\'[0]:
+                 self.assertEqual(encode(chr(b)), (bytes([b]), 1))
+ 
+     def test_raw_decode(self):
+         decode = codecs.unicode_escape_decode
+         for b in range(256):
+             if b != b'\\'[0]:
+                 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
+ 
+     def test_escape_encode(self):
+         encode = codecs.unicode_escape_encode
+         check = coding_checker(self, encode)
+         check('\t', br'\t')
+         check('\n', br'\n')
+         check('\r', br'\r')
+         check('\\', br'\\')
+         for b in range(32):
+             if chr(b) not in '\t\n\r':
+                 check(chr(b), ('\\x%02x' % b).encode())
+         for b in range(127, 256):
+             check(chr(b), ('\\x%02x' % b).encode())
+         check('\u20ac', br'\u20ac')
+         check('\U0001d120', br'\U0001d120')
+ 
+     def test_escape_decode(self):
+         decode = codecs.unicode_escape_decode
+         check = coding_checker(self, decode)
+         check(b"[\\\n]", "[]")
+         check(br'[\"]', '["]')
+         check(br"[\']", "[']")
+         check(br"[\\]", r"[\]")
+         check(br"[\a]", "[\x07]")
+         check(br"[\b]", "[\x08]")
+         check(br"[\t]", "[\x09]")
+         check(br"[\n]", "[\x0a]")
+         check(br"[\v]", "[\x0b]")
+         check(br"[\f]", "[\x0c]")
+         check(br"[\r]", "[\x0d]")
+         check(br"[\7]", "[\x07]")
+         check(br"[\8]", r"[\8]")
+         check(br"[\78]", "[\x078]")
+         check(br"[\41]", "[!]")
+         check(br"[\418]", "[!8]")
+         check(br"[\101]", "[A]")
+         check(br"[\1010]", "[A0]")
+         check(br"[\x41]", "[A]")
+         check(br"[\x410]", "[A0]")
+         check(br"\u20ac", "\u20ac")
+         check(br"\U0001d120", "\U0001d120")
+         for b in range(256):
+             if b not in b'\n"\'\\abtnvfr01234567xuUN':
+                 check(b'\\' + bytes([b]), '\\' + chr(b))
+ 
+     def test_decode_errors(self):
+         decode = codecs.unicode_escape_decode
+         for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
+             for i in range(d):
+                 self.assertRaises(UnicodeDecodeError, decode,
+                                   b"\\" + c + b"0"*i)
+                 self.assertRaises(UnicodeDecodeError, decode,
+                                   b"[\\" + c + b"0"*i + b"]")
+                 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
+                 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
+                 self.assertEqual(decode(data, "replace"),
+                                  ("[\ufffd]\ufffd", len(data)))
+         self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
+         self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
+         self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
+ 
+ 
   class SurrogateEscapeTest(unittest.TestCase):
   
       def test_utf8(self):
diff --cc Misc/NEWS

index b8de2f0086cfc9e6c7d12bfe1ba0d17ca9ac9fad,36edcba5318cc7666acea65fb8d021d650cf4683..f8860058de1c27d26f5b09f5c4f0d2d75076cc85
--- 1/Misc/NEWS
--- 2/Misc/NEWS
+++ b/Misc/NEWS
@@@ -162,9 -214,8 +162,11 @@@ Core and Builtin
   Library
   -------
   
+ - Issue #16979: Fix error handling bugs in the unicode-escape-decode decoder.
+ 
+ +- Issue #1602133: on Mac OS X a shared library build (``--enable-shared``)
+ +  now fills the ``os.environ`` variable correctly.
+ +
   - Issue #9290: In IDLE the sys.std* streams now implement io.TextIOBase
     interface and support all mandatory methods and properties.
   
diff --cc Objects/unicodeobject.c

index c96a91c3973218b3a30000e31e714eb725b1f1bb,3a288d845b8eaa385580ee6f53b84350caca1afa..b559cb1cbda3d1ed20f153bf9466ff2860749a04
--- 1/Objects/unicodeobject.c
--- 2/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@@ -5508,8 -3759,9 +5508,7 @@@ PyUnicode_DecodeUnicodeEscape(const cha
       const char *starts = s;
       Py_ssize_t startinpos;
       Py_ssize_t endinpos;
-     int j;
- -    Py_ssize_t outpos;
- -    PyUnicodeObject *v;
- -    Py_UNICODE *p;
+ +    PyObject *v;
       const char *end;
       char* message;
       Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
@@@ -5668,17 -3872,23 +5656,10 @@@
                   break;
           store:
               /* when we get here, chr is a 32-bit unicode character */
-             if (chr <= MAX_UNICODE) {
-                 WRITECHAR(chr);
- -            if (chr <= 0xffff)
- -                /* UCS-2 character */
- -                *p++ = (Py_UNICODE) chr;
- -            else if (chr <= 0x10ffff) {
- -                /* UCS-4 character. Either store directly, or as
- -                   surrogate pair. */
- -#ifdef Py_UNICODE_WIDE
- -                *p++ = chr;
- -#else
- -                chr -= 0x10000L;
- -                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
- -                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
- -#endif
--            } else {
-                 endinpos = s-starts;
-                 if (unicode_decode_call_errorhandler(
-                         errors, &errorHandler,
-                         "unicodeescape", "illegal Unicode character",
-                         &starts, &end, &startinpos, &endinpos, &exc, &s,
-                         &v, &i))
-                     goto onError;
-             }
- -                message = "illegal Unicode character";
++            message = "illegal Unicode character";
++            if (chr > MAX_UNICODE)
+                 goto error;
- -            }
++            WRITECHAR(chr);
               break;
   
               /* \N{name} */
@@@ -5719,26 -3920,28 +5693,30 @@@
               if (s > end) {
                   message = "\\ at end of string";
                   s--;
-                 endinpos = s-starts;
-                 if (unicode_decode_call_errorhandler(
-                         errors, &errorHandler,
-                         "unicodeescape", message,
-                         &starts, &end, &startinpos, &endinpos, &exc, &s,
-                         &v, &i))
-                     goto onError;
+                 goto error;
               }
               else {
- -                *p++ = '\\';
- -                *p++ = (unsigned char)s[-1];
+ +                WRITECHAR('\\');
+ +                WRITECHAR((unsigned char)s[-1]);
               }
               break;
           }
-       nextByte:
-         ;
+         continue;
+ 
+       error:
+         endinpos = s-starts;
- -        outpos = p-PyUnicode_AS_UNICODE(v);
+         if (unicode_decode_call_errorhandler(
+                 errors, &errorHandler,
+                 "unicodeescape", message,
+                 &starts, &end, &startinpos, &endinpos, &exc, &s,
- -                &v, &outpos, &p))
++                &v, &i))
+             goto onError;
++        len = PyUnicode_GET_LENGTH(v);
+         continue;
       }
- -    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
+ +#undef WRITECHAR
+ +
+ +    if (unicode_resize(&v, i) < 0)
           goto onError;
       Py_XDECREF(errorHandler);
       Py_XDECREF(exc);
author	Serhiy Storchaka <storchaka@gmail.com>
	Tue, 29 Jan 2013 08:28:07 +0000 (10:28 +0200)
committer	Serhiy Storchaka <storchaka@gmail.com>
	Tue, 29 Jan 2013 08:28:07 +0000 (10:28 +0200)
		1	2
Lib/test/test_codeccallbacks.py	patch | diff1 | diff2 | blob | history
Lib/test/test_codecs.py	patch | diff1 | diff2 | blob | history
Misc/NEWS	patch | diff1 | diff2 | blob | history
Objects/unicodeobject.c	patch | diff1 | diff2 | blob | history