#7649: "u'%c' % char" now behaves like "u'%s' % char" and raises a UnicodeDecodeError if 'char' is a byte string that can't be decoded using the default encoding.

author Ezio Melotti <ezio.melotti@gmail.com>

Thu, 25 Feb 2010 17:36:04 +0000 (17:36 +0000)

committer Ezio Melotti <ezio.melotti@gmail.com>

Thu, 25 Feb 2010 17:36:04 +0000 (17:36 +0000)
author Ezio Melotti <ezio.melotti@gmail.com>
Thu, 25 Feb 2010 17:36:04 +0000 (17:36 +0000)
committer Ezio Melotti <ezio.melotti@gmail.com>
Thu, 25 Feb 2010 17:36:04 +0000 (17:36 +0000)
diff --git a/Misc/NEWS b/Misc/NEWS

index ccea907f0e3a63125f60f7a35dfa4655e530c9da..97ddc59a3476bab7e3980bf57ab07fff9f5e9e68 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -16,8 +16,9 @@ Core and Builtins
    UnicodeEncodeError, UnicodeDecodeError, and UnicodeTranslateError to
    strings.
  
-- Issue #7649: Fix u'%c' % char for character in range 0x80..0xFF, raise an
-  UnicodeDecodeError.
+- Issue #7649: "u'%c' % char" now behaves like "u'%s' % char" and raises a
+  UnicodeDecodeError if 'char' is a byte string that can't be decoded using
+  the default encoding.
  
  - Issue #6902: Fix problem with built-in types format incorrectly with
    0 padding.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index d80ff714ae240bec8d4c403c94daaee4491f81c4..113a460e27917d14ebc2029cdf03a9898211dda7 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -8170,7 +8170,8 @@ formatchar(Py_UNICODE *buf,
             size_t buflen,
             PyObject *v)
  {
-    PyObject *s;
+    PyObject *unistr;
+    char *str;
      /* presume that the buffer is at least 2 characters long */
      if (PyUnicode_Check(v)) {
          if (PyUnicode_GET_SIZE(v) != 1)
@@ -8181,14 +8182,22 @@ formatchar(Py_UNICODE *buf,
      else if (PyString_Check(v)) {
          if (PyString_GET_SIZE(v) != 1)
              goto onError;
-        /* #7649: if the char is a non-ascii (i.e. in range(0x80,0x100)) byte
-           string, "u'%c' % char" should fail with a UnicodeDecodeError */
-        s = PyUnicode_FromStringAndSize(PyString_AS_STRING(v), 1);
-        /* if the char is not decodable return -1 */
-        if (s == NULL)
-            return -1;
-        buf[0] = PyUnicode_AS_UNICODE(s)[0];
-        Py_DECREF(s);
+        /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
+           with a UnicodeDecodeError if 'char' is not decodable with the
+           default encoding (usually ASCII, but it might be something else) */
+        str = PyString_AS_STRING(v);
+        if ((unsigned char)str[0] > 0x7F) {
+            /* the char is not ASCII; try to decode the string using the
+               default encoding and return -1 to let the UnicodeDecodeError
+               be raised if the string can't be decoded */
+            unistr = PyUnicode_Decode(str, 1, NULL, "strict");
+            if (unistr == NULL)
+                return -1;
+            buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
+            Py_DECREF(unistr);
+        }
+        else
+            buf[0] = (Py_UNICODE)str[0];
      }
  
      else {
author	Ezio Melotti <ezio.melotti@gmail.com>
	Thu, 25 Feb 2010 17:36:04 +0000 (17:36 +0000)
committer	Ezio Melotti <ezio.melotti@gmail.com>
	Thu, 25 Feb 2010 17:36:04 +0000 (17:36 +0000)
Misc/NEWS		patch | blob | history
Objects/unicodeobject.c		patch | blob | history