granicus.if.org Git - python/commitdiff
Issue #16979: Fix error handling bugs in the unicode-escape-decode decoder.
author Serhiy Storchaka <storchaka@gmail.com>
Tue, 29 Jan 2013 08:28:07 +0000 (10:28 +0200)
committer Serhiy Storchaka <storchaka@gmail.com>
Tue, 29 Jan 2013 08:28:07 +0000 (10:28 +0200)
1  2 
Lib/test/test_codeccallbacks.py
Lib/test/test_codecs.py
Misc/NEWS
Objects/unicodeobject.c

Simple merge
index 67690b855f585e74b61e2f57fece9625efa8203f,e74038be4d85d47c1601b326c0b8db45e3dcd2a0..4c0c6debfaa2d579dd49f257d76b61512391947f
@@@ -1,26 -1,14 +1,31 @@@
 -from test import support
 -import unittest
 +import _testcapi
  import codecs
 +import io
  import locale
 -import sys, _testcapi, io
 +import sys
 +import unittest
 +import warnings
 +
 +from test import support
 +
 +if sys.platform == 'win32':
 +    VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)
 +else:
 +    VISTA_OR_LATER = False
 +
 +try:
 +    import ctypes
 +except ImportError:
 +    ctypes = None
 +    SIZEOF_WCHAR_T = -1
 +else:
 +    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
  
+ def coding_checker(self, coder):
+     def check(input, expect):
+         self.assertEqual(coder(input), (expect, len(input)))
+     return check
  class Queue(object):
      """
      queue: write bytes at one end, read bytes from the other end
@@@ -2003,12 -1851,85 +2008,91 @@@ class TypesTest(unittest.TestCase)
          self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
          self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
  
 +        self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
 +        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
 +
 +        self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000")
 +        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
 +
+ class UnicodeEscapeTest(unittest.TestCase):
+     def test_empty(self):
+         self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
+         self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
+     def test_raw_encode(self):
+         encode = codecs.unicode_escape_encode
+         for b in range(32, 127):
+             if b != b'\\'[0]:
+                 self.assertEqual(encode(chr(b)), (bytes([b]), 1))
+     def test_raw_decode(self):
+         decode = codecs.unicode_escape_decode
+         for b in range(256):
+             if b != b'\\'[0]:
+                 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
+     def test_escape_encode(self):
+         encode = codecs.unicode_escape_encode
+         check = coding_checker(self, encode)
+         check('\t', br'\t')
+         check('\n', br'\n')
+         check('\r', br'\r')
+         check('\\', br'\\')
+         for b in range(32):
+             if chr(b) not in '\t\n\r':
+                 check(chr(b), ('\\x%02x' % b).encode())
+         for b in range(127, 256):
+             check(chr(b), ('\\x%02x' % b).encode())
+         check('\u20ac', br'\u20ac')
+         check('\U0001d120', br'\U0001d120')
+     def test_escape_decode(self):
+         decode = codecs.unicode_escape_decode
+         check = coding_checker(self, decode)
+         check(b"[\\\n]", "[]")
+         check(br'[\"]', '["]')
+         check(br"[\']", "[']")
+         check(br"[\\]", r"[\]")
+         check(br"[\a]", "[\x07]")
+         check(br"[\b]", "[\x08]")
+         check(br"[\t]", "[\x09]")
+         check(br"[\n]", "[\x0a]")
+         check(br"[\v]", "[\x0b]")
+         check(br"[\f]", "[\x0c]")
+         check(br"[\r]", "[\x0d]")
+         check(br"[\7]", "[\x07]")
+         check(br"[\8]", r"[\8]")
+         check(br"[\78]", "[\x078]")
+         check(br"[\41]", "[!]")
+         check(br"[\418]", "[!8]")
+         check(br"[\101]", "[A]")
+         check(br"[\1010]", "[A0]")
+         check(br"[\x41]", "[A]")
+         check(br"[\x410]", "[A0]")
+         check(br"\u20ac", "\u20ac")
+         check(br"\U0001d120", "\U0001d120")
+         for b in range(256):
+             if b not in b'\n"\'\\abtnvfr01234567xuUN':
+                 check(b'\\' + bytes([b]), '\\' + chr(b))
+     def test_decode_errors(self):
+         decode = codecs.unicode_escape_decode
+         for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
+             for i in range(d):
+                 self.assertRaises(UnicodeDecodeError, decode,
+                                   b"\\" + c + b"0"*i)
+                 self.assertRaises(UnicodeDecodeError, decode,
+                                   b"[\\" + c + b"0"*i + b"]")
+                 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
+                 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
+                 self.assertEqual(decode(data, "replace"),
+                                  ("[\ufffd]\ufffd", len(data)))
+         self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
+         self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
+         self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
  class SurrogateEscapeTest(unittest.TestCase):
  
      def test_utf8(self):
diff --cc Misc/NEWS
index b8de2f0086cfc9e6c7d12bfe1ba0d17ca9ac9fad,36edcba5318cc7666acea65fb8d021d650cf4683..f8860058de1c27d26f5b09f5c4f0d2d75076cc85
+++ b/Misc/NEWS
@@@ -162,9 -214,8 +162,11 @@@ Core and Builtin
  Library
  -------
  
+ - Issue #16979: Fix error handling bugs in the unicode-escape-decode decoder.
 +- Issue #1602133: on Mac OS X a shared library build (``--enable-shared``)
 +  now fills the ``os.environ`` variable correctly.
 +
  - Issue #9290: In IDLE the sys.std* streams now implement io.TextIOBase
    interface and support all mandatory methods and properties.
  
index c96a91c3973218b3a30000e31e714eb725b1f1bb,3a288d845b8eaa385580ee6f53b84350caca1afa..b559cb1cbda3d1ed20f153bf9466ff2860749a04
@@@ -5508,8 -3759,9 +5508,7 @@@ PyUnicode_DecodeUnicodeEscape(const cha
      const char *starts = s;
      Py_ssize_t startinpos;
      Py_ssize_t endinpos;
-     int j;
 -    Py_ssize_t outpos;
 -    PyUnicodeObject *v;
 -    Py_UNICODE *p;
 +    PyObject *v;
      const char *end;
      char* message;
      Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
                  break;
          store:
              /* when we get here, chr is a 32-bit unicode character */
-             if (chr <= MAX_UNICODE) {
-                 WRITECHAR(chr);
 -            if (chr <= 0xffff)
 -                /* UCS-2 character */
 -                *p++ = (Py_UNICODE) chr;
 -            else if (chr <= 0x10ffff) {
 -                /* UCS-4 character. Either store directly, or as
 -                   surrogate pair. */
 -#ifdef Py_UNICODE_WIDE
 -                *p++ = chr;
 -#else
 -                chr -= 0x10000L;
 -                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
 -                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
 -#endif
--            } else {
-                 endinpos = s-starts;
-                 if (unicode_decode_call_errorhandler(
-                         errors, &errorHandler,
-                         "unicodeescape", "illegal Unicode character",
-                         &starts, &end, &startinpos, &endinpos, &exc, &s,
-                         &v, &i))
-                     goto onError;
-             }
 -                message = "illegal Unicode character";
++            message = "illegal Unicode character";
++            if (chr > MAX_UNICODE)
+                 goto error;
 -            }
++            WRITECHAR(chr);
              break;
  
              /* \N{name} */
              if (s > end) {
                  message = "\\ at end of string";
                  s--;
-                 endinpos = s-starts;
-                 if (unicode_decode_call_errorhandler(
-                         errors, &errorHandler,
-                         "unicodeescape", message,
-                         &starts, &end, &startinpos, &endinpos, &exc, &s,
-                         &v, &i))
-                     goto onError;
+                 goto error;
              }
              else {
 -                *p++ = '\\';
 -                *p++ = (unsigned char)s[-1];
 +                WRITECHAR('\\');
 +                WRITECHAR((unsigned char)s[-1]);
              }
              break;
          }
-       nextByte:
-         ;
+         continue;
+       error:
+         endinpos = s-starts;
 -        outpos = p-PyUnicode_AS_UNICODE(v);
+         if (unicode_decode_call_errorhandler(
+                 errors, &errorHandler,
+                 "unicodeescape", message,
+                 &starts, &end, &startinpos, &endinpos, &exc, &s,
 -                &v, &outpos, &p))
++                &v, &i))
+             goto onError;
++        len = PyUnicode_GET_LENGTH(v);
+         continue;
      }
 -    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
 +#undef WRITECHAR
 +
 +    if (unicode_resize(&v, i) < 0)
          goto onError;
      Py_XDECREF(errorHandler);
      Py_XDECREF(exc);