]> granicus.if.org Git - python/commitdiff
Add 'U'/'U#' format characters to Py_BuildValue (and thus
authorWalter Dörwald <walter@livinglogic.de>
Fri, 18 May 2007 16:29:38 +0000 (16:29 +0000)
committerWalter Dörwald <walter@livinglogic.de>
Fri, 18 May 2007 16:29:38 +0000 (16:29 +0000)
to PyObject_CallFunction()) that take a char * (and a size
in the case of 'U#') and create a unicode object out of it.

Add functions PyUnicode_FromFormat() and PyUnicode_FromFormatV()
that work similarly to PyString_FromFormat(), but create a unicode
object (also a %U format character has been added, that takes
a PyObject *, which must point to a unicode object).

Change the encoding and reason attributes of UnicodeEncodeError,
UnicodeDecodeError and UnicodeTranslateError to be unicode
objects.

Doc/api/utilities.tex
Include/unicodeobject.h
Lib/test/test_codeccallbacks.py
Objects/exceptions.c
Objects/unicodeobject.c
Python/modsupport.c

index fb9c90996998c7c4a1485183762bef5693a0f462..968ce4fead5f0c94abb2f52442aedb31f4e36f6e 100644 (file)
@@ -848,6 +848,15 @@ PyArg_ParseTuple(args, "O|O:ref", &object, &callback)
     to a Python Unicode object.   If the Unicode buffer pointer
     is \NULL, the length is ignored and \code{None} is returned.
 
+    \item[\samp{U} (string) {[char *]}]
+    Convert a null-terminated C string to a Python unicode object.
+    If the C string pointer is \NULL, \code{None} is used.
+
+    \item[\samp{U\#} (string) {[char *, int]}]
+    Convert a C string and its length to a Python unicode object.
+    If the C string pointer is \NULL, the length is ignored and \code{None}
+    is returned.
+
     \item[\samp{i} (integer) {[int]}]
     Convert a plain C \ctype{int} to a Python integer object.
 
index 9d0cabf6d3fc513ec9ff8e3827a7917429a92127..2e27d7486ff4908ac9a8152dd5c13ae8124700cb 100644 (file)
@@ -173,7 +173,10 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
 # define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
 # define PyUnicode_FromString PyUnicodeUCS2_FromString
 # define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
+# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize
+# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV
+# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat
 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
 # define PyUnicode_GetMax PyUnicodeUCS2_GetMax
 # define PyUnicode_GetSize PyUnicodeUCS2_GetSize
@@ -252,6 +254,9 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
 # define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
 # define PyUnicode_FromString PyUnicodeUCS4_FromString
+# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize
+# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV
+# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat
 # define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
 # define PyUnicode_GetMax PyUnicodeUCS4_GetMax
@@ -429,6 +434,12 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
     Py_ssize_t size             /* size of buffer */
     );
 
+/* Similar to PyUnicode_FromUnicode(), but u points to Latin-1 encoded bytes */
+PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
+    const char *u,        /* char buffer */
+    Py_ssize_t size       /* size of buffer */
+    );
+
 /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
    Latin-1 encoded bytes */
 PyAPI_FUNC(PyObject*) PyUnicode_FromString(
@@ -510,6 +521,9 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
     register PyObject *obj     /* Object */
     );
 
+PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list);  /* printf-style formatting into a unicode object; arguments arrive as a va_list */
+PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...);  /* same formatting, with the arguments passed variadically */
+
 /* --- wchar_t support for platforms which support it --------------------- */
 
 #ifdef HAVE_WCHAR_H
index 911496dc1a0379ed5d7e0c97164a1ccc7461817d..4981d54af810305ad6129aa97894b14bb944ae2e 100644 (file)
@@ -21,43 +21,43 @@ class PosReturn:
 # A UnicodeEncodeError object with a bad start attribute
 class BadStartUnicodeEncodeError(UnicodeEncodeError):
     def __init__(self):
-        UnicodeEncodeError.__init__(self, str8("ascii"), "", 0, 1, str8("bad"))
+        UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad")
         self.start = []
 
 # A UnicodeEncodeError object with a bad object attribute
 class BadObjectUnicodeEncodeError(UnicodeEncodeError):
     def __init__(self):
-        UnicodeEncodeError.__init__(self, str8("ascii"), "", 0, 1, str8("bad"))
+        UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad")
         self.object = []
 
 # A UnicodeDecodeError object without an end attribute
 class NoEndUnicodeDecodeError(UnicodeDecodeError):
     def __init__(self):
-        UnicodeDecodeError.__init__(self, str8("ascii"), b"", 0, 1, str8("bad"))
+        UnicodeDecodeError.__init__(self, "ascii", b"", 0, 1, "bad")
         del self.end
 
 # A UnicodeDecodeError object with a bad object attribute
 class BadObjectUnicodeDecodeError(UnicodeDecodeError):
     def __init__(self):
-        UnicodeDecodeError.__init__(self, str8("ascii"), b"", 0, 1, str8("bad"))
+        UnicodeDecodeError.__init__(self, "ascii", b"", 0, 1, "bad")
         self.object = []
 
 # A UnicodeTranslateError object without a start attribute
 class NoStartUnicodeTranslateError(UnicodeTranslateError):
     def __init__(self):
-        UnicodeTranslateError.__init__(self, "", 0, 1, str8("bad"))
+        UnicodeTranslateError.__init__(self, "", 0, 1, "bad")
         del self.start
 
 # A UnicodeTranslateError object without an end attribute
 class NoEndUnicodeTranslateError(UnicodeTranslateError):
     def __init__(self):
-        UnicodeTranslateError.__init__(self,  "", 0, 1, str8("bad"))
+        UnicodeTranslateError.__init__(self,  "", 0, 1, "bad")
         del self.end
 
 # A UnicodeTranslateError object without an object attribute
 class NoObjectUnicodeTranslateError(UnicodeTranslateError):
     def __init__(self):
-        UnicodeTranslateError.__init__(self, "", 0, 1, str8("bad"))
+        UnicodeTranslateError.__init__(self, "", 0, 1, "bad")
         del self.object
 
 class CodecCallbackTest(unittest.TestCase):
@@ -328,73 +328,73 @@ class CodecCallbackTest(unittest.TestCase):
     def test_unicodeencodeerror(self):
         self.check_exceptionobjectargs(
             UnicodeEncodeError,
-            [str8("ascii"), "g\xfcrk", 1, 2, str8("ouch")],
+            ["ascii", "g\xfcrk", 1, 2, "ouch"],
             "'ascii' codec can't encode character u'\\xfc' in position 1: ouch"
         )
         self.check_exceptionobjectargs(
             UnicodeEncodeError,
-            [str8("ascii"), "g\xfcrk", 1, 4, str8("ouch")],
+            ["ascii", "g\xfcrk", 1, 4, "ouch"],
             "'ascii' codec can't encode characters in position 1-3: ouch"
         )
         self.check_exceptionobjectargs(
             UnicodeEncodeError,
-            [str8("ascii"), "\xfcx", 0, 1, str8("ouch")],
+            ["ascii", "\xfcx", 0, 1, "ouch"],
             "'ascii' codec can't encode character u'\\xfc' in position 0: ouch"
         )
         self.check_exceptionobjectargs(
             UnicodeEncodeError,
-            [str8("ascii"), "\u0100x", 0, 1, str8("ouch")],
+            ["ascii", "\u0100x", 0, 1, "ouch"],
             "'ascii' codec can't encode character u'\\u0100' in position 0: ouch"
         )
         self.check_exceptionobjectargs(
             UnicodeEncodeError,
-            [str8("ascii"), "\uffffx", 0, 1, str8("ouch")],
+            ["ascii", "\uffffx", 0, 1, "ouch"],
             "'ascii' codec can't encode character u'\\uffff' in position 0: ouch"
         )
         if sys.maxunicode > 0xffff:
             self.check_exceptionobjectargs(
                 UnicodeEncodeError,
-                [str8("ascii"), "\U00010000x", 0, 1, str8("ouch")],
+                ["ascii", "\U00010000x", 0, 1, "ouch"],
                 "'ascii' codec can't encode character u'\\U00010000' in position 0: ouch"
             )
 
     def test_unicodedecodeerror(self):
         self.check_exceptionobjectargs(
             UnicodeDecodeError,
-            [str8("ascii"), b"g\xfcrk", 1, 2, str8("ouch")],
+            ["ascii", b"g\xfcrk", 1, 2, "ouch"],
             "'ascii' codec can't decode byte 0xfc in position 1: ouch"
         )
         self.check_exceptionobjectargs(
             UnicodeDecodeError,
-            [str8("ascii"), b"g\xfcrk", 1, 3, str8("ouch")],
+            ["ascii", b"g\xfcrk", 1, 3, "ouch"],
             "'ascii' codec can't decode bytes in position 1-2: ouch"
         )
 
     def test_unicodetranslateerror(self):
         self.check_exceptionobjectargs(
             UnicodeTranslateError,
-            ["g\xfcrk", 1, 2, str8("ouch")],
+            ["g\xfcrk", 1, 2, "ouch"],
             "can't translate character u'\\xfc' in position 1: ouch"
         )
         self.check_exceptionobjectargs(
             UnicodeTranslateError,
-            ["g\u0100rk", 1, 2, str8("ouch")],
+            ["g\u0100rk", 1, 2, "ouch"],
             "can't translate character u'\\u0100' in position 1: ouch"
         )
         self.check_exceptionobjectargs(
             UnicodeTranslateError,
-            ["g\uffffrk", 1, 2, str8("ouch")],
+            ["g\uffffrk", 1, 2, "ouch"],
             "can't translate character u'\\uffff' in position 1: ouch"
         )
         if sys.maxunicode > 0xffff:
             self.check_exceptionobjectargs(
                 UnicodeTranslateError,
-                ["g\U00010000rk", 1, 2, str8("ouch")],
+                ["g\U00010000rk", 1, 2, "ouch"],
                 "can't translate character u'\\U00010000' in position 1: ouch"
             )
         self.check_exceptionobjectargs(
             UnicodeTranslateError,
-            ["g\xfcrk", 1, 3, str8("ouch")],
+            ["g\xfcrk", 1, 3, "ouch"],
             "can't translate characters in position 1-2: ouch"
         )
 
@@ -416,7 +416,7 @@ class CodecCallbackTest(unittest.TestCase):
         self.assertRaises(
             UnicodeEncodeError,
             codecs.strict_errors,
-            UnicodeEncodeError(str8("ascii"), "\u3042", 0, 1, str8("ouch"))
+            UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")
         )
 
     def test_badandgoodignoreexceptions(self):
@@ -435,17 +435,17 @@ class CodecCallbackTest(unittest.TestCase):
         # If the correct exception is passed in, "ignore" returns an empty replacement
         self.assertEquals(
             codecs.ignore_errors(
-                UnicodeEncodeError(str8("ascii"), "\u3042", 0, 1, str8("ouch"))),
+                UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")),
             ("", 1)
         )
         self.assertEquals(
             codecs.ignore_errors(
-                UnicodeDecodeError(str8("ascii"), b"\xff", 0, 1, str8("ouch"))),
+                UnicodeDecodeError("ascii", b"\xff", 0, 1, "ouch")),
             ("", 1)
         )
         self.assertEquals(
             codecs.ignore_errors(
-                UnicodeTranslateError("\u3042", 0, 1, str8("ouch"))),
+                UnicodeTranslateError("\u3042", 0, 1, "ouch")),
             ("", 1)
         )
 
@@ -475,17 +475,17 @@ class CodecCallbackTest(unittest.TestCase):
         # With the correct exception, "replace" returns an "?" or "\ufffd" replacement
         self.assertEquals(
             codecs.replace_errors(
-                UnicodeEncodeError(str8("ascii"), "\u3042", 0, 1, str8("ouch"))),
+                UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")),
             ("?", 1)
         )
         self.assertEquals(
             codecs.replace_errors(
-                UnicodeDecodeError(str8("ascii"), b"\xff", 0, 1, str8("ouch"))),
+                UnicodeDecodeError("ascii", b"\xff", 0, 1, "ouch")),
             ("\ufffd", 1)
         )
         self.assertEquals(
             codecs.replace_errors(
-                UnicodeTranslateError("\u3042", 0, 1, str8("ouch"))),
+                UnicodeTranslateError("\u3042", 0, 1, "ouch")),
             ("\ufffd", 1)
         )
 
@@ -506,19 +506,19 @@ class CodecCallbackTest(unittest.TestCase):
         self.assertRaises(
             TypeError,
             codecs.xmlcharrefreplace_errors,
-            UnicodeDecodeError(str8("ascii"), b"\xff", 0, 1, str8("ouch"))
+            UnicodeDecodeError("ascii", b"\xff", 0, 1, "ouch")
         )
         self.assertRaises(
             TypeError,
             codecs.xmlcharrefreplace_errors,
-            UnicodeTranslateError("\u3042", 0, 1, str8("ouch"))
+            UnicodeTranslateError("\u3042", 0, 1, "ouch")
         )
         # Use the correct exception
         cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 0x3042)
         s = "".join(chr(c) for c in cs)
         self.assertEquals(
             codecs.xmlcharrefreplace_errors(
-                UnicodeEncodeError(str8("ascii"), s, 0, len(s), str8("ouch"))
+                UnicodeEncodeError("ascii", s, 0, len(s), "ouch")
             ),
             ("".join("&#%d;" % ord(c) for c in s), len(s))
         )
@@ -540,48 +540,48 @@ class CodecCallbackTest(unittest.TestCase):
         self.assertRaises(
             TypeError,
             codecs.backslashreplace_errors,
-            UnicodeDecodeError(str8("ascii"), b"\xff", 0, 1, str8("ouch"))
+            UnicodeDecodeError("ascii", b"\xff", 0, 1, "ouch")
         )
         self.assertRaises(
             TypeError,
             codecs.backslashreplace_errors,
-            UnicodeTranslateError("\u3042", 0, 1, str8("ouch"))
+            UnicodeTranslateError("\u3042", 0, 1, "ouch")
         )
         # Use the correct exception
         self.assertEquals(
             codecs.backslashreplace_errors(
-                UnicodeEncodeError(str8("ascii"), "\u3042", 0, 1, str8("ouch"))),
+                UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")),
             ("\\u3042", 1)
         )
         self.assertEquals(
             codecs.backslashreplace_errors(
-                UnicodeEncodeError(str8("ascii"), "\x00", 0, 1, str8("ouch"))),
+                UnicodeEncodeError("ascii", "\x00", 0, 1, "ouch")),
             ("\\x00", 1)
         )
         self.assertEquals(
             codecs.backslashreplace_errors(
-                UnicodeEncodeError(str8("ascii"), "\xff", 0, 1, str8("ouch"))),
+                UnicodeEncodeError("ascii", "\xff", 0, 1, "ouch")),
             ("\\xff", 1)
         )
         self.assertEquals(
             codecs.backslashreplace_errors(
-                UnicodeEncodeError(str8("ascii"), "\u0100", 0, 1, str8("ouch"))),
+                UnicodeEncodeError("ascii", "\u0100", 0, 1, "ouch")),
             ("\\u0100", 1)
         )
         self.assertEquals(
             codecs.backslashreplace_errors(
-                UnicodeEncodeError(str8("ascii"), "\uffff", 0, 1, str8("ouch"))),
+                UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")),
             ("\\uffff", 1)
         )
         if sys.maxunicode>0xffff:
             self.assertEquals(
                 codecs.backslashreplace_errors(
-                    UnicodeEncodeError(str8("ascii"), "\U00010000", 0, 1, str8("ouch"))),
+                    UnicodeEncodeError("ascii", "\U00010000", 0, 1, "ouch")),
                 ("\\U00010000", 1)
             )
             self.assertEquals(
                 codecs.backslashreplace_errors(
-                    UnicodeEncodeError(str8("ascii"), "\U0010ffff", 0, 1, str8("ouch"))),
+                    UnicodeEncodeError("ascii", "\U0010ffff", 0, 1, "ouch")),
                 ("\\U0010ffff", 1)
             )
 
index fabf3590d5c44816d11d76484295733e492d326f..36e37955dc376028ec20c08d3d4cd52137e8695b 100644 (file)
@@ -1186,35 +1186,6 @@ set_ssize_t(PyObject **attr, Py_ssize_t value)
     return 0;
 }
 
-static PyObject *
-get_string(PyObject *attr, const char *name)
-{
-    if (!attr) {
-        PyErr_Format(PyExc_TypeError, "%.200s attribute not set", name);
-        return NULL;
-    }
-
-    if (!PyString_Check(attr)) {
-        PyErr_Format(PyExc_TypeError, "%.200s attribute must be str", name);
-        return NULL;
-    }
-    Py_INCREF(attr);
-    return attr;
-}
-
-
-static int
-set_string(PyObject **attr, const char *value)
-{
-    PyObject *obj = PyString_FromString(value);
-    if (!obj)
-        return -1;
-    Py_CLEAR(*attr);
-    *attr = obj;
-    return 0;
-}
-
-
 static PyObject *
 get_bytes(PyObject *attr, const char *name)
 {
@@ -1248,16 +1219,27 @@ get_unicode(PyObject *attr, const char *name)
     return attr;
 }
 
+static int  /* stores a unicode object built from value in *attr; 0 on success, -1 if the conversion fails */
+set_unicodefromstring(PyObject **attr, const char *value)
+{
+    PyObject *replacement = PyUnicode_FromString(value);
+    if (replacement == NULL)
+        return -1;  /* *attr has not been modified at this point */
+    Py_CLEAR(*attr);  /* release whatever was stored before */
+    *attr = replacement;
+    return 0;
+}
+
 PyObject *
 PyUnicodeEncodeError_GetEncoding(PyObject *exc)
 {
-    return get_string(((PyUnicodeErrorObject *)exc)->encoding, "encoding");
+    return get_unicode(((PyUnicodeErrorObject *)exc)->encoding, "encoding");
 }
 
 PyObject *
 PyUnicodeDecodeError_GetEncoding(PyObject *exc)
 {
-    return get_string(((PyUnicodeErrorObject *)exc)->encoding, "encoding");
+    return get_unicode(((PyUnicodeErrorObject *)exc)->encoding, "encoding");
 }
 
 PyObject *
@@ -1416,42 +1398,45 @@ PyUnicodeTranslateError_SetEnd(PyObject *exc, Py_ssize_t end)
 PyObject *
 PyUnicodeEncodeError_GetReason(PyObject *exc)
 {
-    return get_string(((PyUnicodeErrorObject *)exc)->reason, "reason");
+    return get_unicode(((PyUnicodeErrorObject *)exc)->reason, "reason");
 }
 
 
 PyObject *
 PyUnicodeDecodeError_GetReason(PyObject *exc)
 {
-    return get_string(((PyUnicodeErrorObject *)exc)->reason, "reason");
+    return get_unicode(((PyUnicodeErrorObject *)exc)->reason, "reason");
 }
 
 
 PyObject *
 PyUnicodeTranslateError_GetReason(PyObject *exc)
 {
-    return get_string(((PyUnicodeErrorObject *)exc)->reason, "reason");
+    return get_unicode(((PyUnicodeErrorObject *)exc)->reason, "reason");
 }
 
 
 int
 PyUnicodeEncodeError_SetReason(PyObject *exc, const char *reason)
 {
-    return set_string(&((PyUnicodeErrorObject *)exc)->reason, reason);
+    return set_unicodefromstring(&((PyUnicodeErrorObject *)exc)->reason,
+                                 reason);
 }
 
 
 int
 PyUnicodeDecodeError_SetReason(PyObject *exc, const char *reason)
 {
-    return set_string(&((PyUnicodeErrorObject *)exc)->reason, reason);
+    return set_unicodefromstring(&((PyUnicodeErrorObject *)exc)->reason,
+                                 reason);
 }
 
 
 int
 PyUnicodeTranslateError_SetReason(PyObject *exc, const char *reason)
 {
-    return set_string(&((PyUnicodeErrorObject *)exc)->reason, reason);
+    return set_unicodefromstring(&((PyUnicodeErrorObject *)exc)->reason,
+                                 reason);
 }
 
 
@@ -1466,11 +1451,11 @@ UnicodeError_init(PyUnicodeErrorObject *self, PyObject *args, PyObject *kwds,
     Py_CLEAR(self->reason);
 
     if (!PyArg_ParseTuple(args, "O!O!O!O!O!",
-        &PyString_Type, &self->encoding,
+        &PyUnicode_Type, &self->encoding,
         objecttype, &self->object,
         &PyLong_Type, &self->start,
         &PyLong_Type, &self->end,
-        &PyString_Type, &self->reason)) {
+        &PyUnicode_Type, &self->reason)) {
         self->encoding = self->object = self->start = self->end =
             self->reason = NULL;
         return -1;
@@ -1564,20 +1549,20 @@ UnicodeEncodeError_str(PyObject *self)
             PyOS_snprintf(badchar_str, sizeof(badchar_str), "u%04x", badchar);
         else
             PyOS_snprintf(badchar_str, sizeof(badchar_str), "U%08x", badchar);
-        return PyString_FromFormat(
-            "'%.400s' codec can't encode character u'\\%s' in position %zd: %.400s",
-            PyString_AS_STRING(((PyUnicodeErrorObject *)self)->encoding),
+        return PyUnicode_FromFormat(
+            "'%U' codec can't encode character u'\\%s' in position %zd: %U",
+            ((PyUnicodeErrorObject *)self)->encoding,
             badchar_str,
             start,
-            PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason)
+            ((PyUnicodeErrorObject *)self)->reason
         );
     }
-    return PyString_FromFormat(
-        "'%.400s' codec can't encode characters in position %zd-%zd: %.400s",
-        PyString_AS_STRING(((PyUnicodeErrorObject *)self)->encoding),
+    return PyUnicode_FromFormat(
+        "'%U' codec can't encode characters in position %zd-%zd: %U",
+        ((PyUnicodeErrorObject *)self)->encoding,
         start,
         (end-1),
-        PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason)
+        ((PyUnicodeErrorObject *)self)->reason
     );
 }
 
@@ -1601,7 +1586,7 @@ PyUnicodeEncodeError_Create(
     const char *encoding, const Py_UNICODE *object, Py_ssize_t length,
     Py_ssize_t start, Py_ssize_t end, const char *reason)
 {
-    return PyObject_CallFunction(PyExc_UnicodeEncodeError, "su#nns",
+    return PyObject_CallFunction(PyExc_UnicodeEncodeError, "Uu#nnU",
                                  encoding, object, length, start, end, reason);
 }
 
@@ -1626,30 +1611,30 @@ UnicodeDecodeError_str(PyObject *self)
     Py_ssize_t end = 0;
 
     if (PyUnicodeDecodeError_GetStart(self, &start))
-    return NULL;
+        return NULL;
 
     if (PyUnicodeDecodeError_GetEnd(self, &end))
-    return NULL;
+        return NULL;
 
     if (end==start+1) {
         /* FromFormat does not support %02x, so format that separately */
         char byte[4];
         PyOS_snprintf(byte, sizeof(byte), "%02x",
                       ((int)PyBytes_AS_STRING(((PyUnicodeErrorObject *)self)->object)[start])&0xff);
-        return PyString_FromFormat(
-            "'%.400s' codec can't decode byte 0x%s in position %zd: %.400s",
-            PyString_AS_STRING(((PyUnicodeErrorObject *)self)->encoding),
+        return PyUnicode_FromFormat(
+            "'%U' codec can't decode byte 0x%s in position %zd: %U",
+            ((PyUnicodeErrorObject *)self)->encoding,
             byte,
             start,
-            PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason)
+            ((PyUnicodeErrorObject *)self)->reason
         );
     }
-    return PyString_FromFormat(
-        "'%.400s' codec can't decode bytes in position %zd-%zd: %.400s",
-        PyString_AS_STRING(((PyUnicodeErrorObject *)self)->encoding),
+    return PyUnicode_FromFormat(
+        "'%U' codec can't decode bytes in position %zd-%zd: %U",
+        ((PyUnicodeErrorObject *)self)->encoding,
         start,
         (end-1),
-        PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason)
+        ((PyUnicodeErrorObject *)self)->reason
     );
 }
 
@@ -1676,7 +1661,7 @@ PyUnicodeDecodeError_Create(
     assert(length < INT_MAX);
     assert(start < INT_MAX);
     assert(end < INT_MAX);
-    return PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
+    return PyObject_CallFunction(PyExc_UnicodeDecodeError, "Uy#nnU",
                                  encoding, object, length, start, end, reason);
 }
 
@@ -1701,7 +1686,7 @@ UnicodeTranslateError_init(PyUnicodeErrorObject *self, PyObject *args,
         &PyUnicode_Type, &self->object,
         &PyLong_Type, &self->start,
         &PyLong_Type, &self->end,
-        &PyString_Type, &self->reason)) {
+        &PyUnicode_Type, &self->reason)) {
         self->object = self->start = self->end = self->reason = NULL;
         return -1;
     }
@@ -1736,18 +1721,18 @@ UnicodeTranslateError_str(PyObject *self)
             PyOS_snprintf(badchar_str, sizeof(badchar_str), "u%04x", badchar);
         else
             PyOS_snprintf(badchar_str, sizeof(badchar_str), "U%08x", badchar);
-        return PyString_FromFormat(
-            "can't translate character u'\\%s' in position %zd: %.400s",
+        return PyUnicode_FromFormat(
+            "can't translate character u'\\%s' in position %zd: %U",
             badchar_str,
             start,
-            PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason)
+            ((PyUnicodeErrorObject *)self)->reason
         );
     }
-    return PyString_FromFormat(
-        "can't translate characters in position %zd-%zd: %.400s",
+    return PyUnicode_FromFormat(
+        "can't translate characters in position %zd-%zd: %U",
         start,
         (end-1),
-        PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason)
+        ((PyUnicodeErrorObject *)self)->reason
     );
 }
 
index 7e455a5e328778295aadb86be1edc044e7f657ca..e77b65dd2c495b5a564b873020f2ac3344961353 100644 (file)
@@ -393,15 +393,9 @@ PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
     return (PyObject *)unicode;
 }
 
-PyObject *PyUnicode_FromString(const char *u)
+PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 {
     PyUnicodeObject *unicode;
-    size_t size = strlen(u);
-    if (size > PY_SSIZE_T_MAX) {
-        PyErr_SetString(PyExc_OverflowError, "input too long");
-        return NULL;
-    }
-
     /* If the Unicode data is known at construction time, we can apply
        some optimizations which share commonly used objects. */
     if (u != NULL) {
@@ -441,6 +435,17 @@ PyObject *PyUnicode_FromString(const char *u)
     return (PyObject *)unicode;
 }
 
+PyObject *PyUnicode_FromString(const char *u)
+{
+    size_t length = strlen(u);  /* strlen() is applied directly, so u must be a valid null-terminated string */
+    if (length <= PY_SSIZE_T_MAX)
+        /* the length is representable as Py_ssize_t: hand over to the sized constructor */
+        return PyUnicode_FromStringAndSize(u, length);
+
+    PyErr_SetString(PyExc_OverflowError, "input too long");  /* length exceeds PY_SSIZE_T_MAX */
+    return NULL;
+}
+
 #ifdef HAVE_WCHAR_H
 
 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
@@ -473,6 +478,223 @@ PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
     return (PyObject *)unicode;
 }
 
+#define appendstring(string) {for (copy = (string); *copy;) *s++ = (unsigned char)*copy++;}  /* copy a null-terminated byte string to the output cursor s; each byte is widened through unsigned char so values >= 0x80 are not sign-extended where plain char is signed */
+/* Build a unicode object from a printf-style format: pass 1 walks a copy of vargs to compute a worst-case length, pass 2 fills the buffer and trims it. Returns NULL if the buffer cannot be allocated. */
+PyObject *
+PyUnicode_FromFormatV(const char *format, va_list vargs)
+{
+       va_list count;          /* private copy of vargs, consumed by the sizing pass */
+       Py_ssize_t n = 0;       /* pass 1: worst-case length; pass 2: reused for the parsed width/precision */
+       const char* f;
+       Py_UNICODE *s;          /* write cursor into the result buffer */
+       PyObject *string;
+       /* used by sprintf */
+       char buffer[21];
+       const char *copy;       /* read cursor used by appendstring() */
+
+#ifdef VA_LIST_IS_ARRAY
+       Py_MEMCPY(count, vargs, sizeof(va_list));
+#else
+#ifdef  __va_copy
+       __va_copy(count, vargs);
+#else
+       count = vargs;
+#endif
+#endif
+       /* step 1: figure out how large a buffer we need */
+       for (f = format; *f; f++) {
+               if (*f == '%') {
+                       const char* p = f;      /* start of this directive, needed by the unknown-code fallback */
+                       while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
+                               ;
+
+                       /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
+                        * they don't affect the amount of space we reserve.
+                        */
+                       if ((*f == 'l' || *f == 'z') &&
+                                       (f[1] == 'd' || f[1] == 'u'))
+                               ++f;
+
+                       switch (*f) {
+                       case 'c':
+                               (void)va_arg(count, int);
+                               /* fall through... */
+                       case '%':
+                               n++;
+                               break;
+                       case 'd': case 'u': case 'i': case 'x':
+                               (void) va_arg(count, int);      /* NOTE(review): %ld/%lu/%zd/%zu arguments are also consumed as int here -- confirm this is safe on all target ABIs */
+                               /* 20 bytes is enough to hold a 64-bit
+                                  integer.  Decimal takes the most space.
+                                  This isn't enough for octal. */
+                               n += 20;
+                               break;
+                       case 's':
+                               n += strlen(va_arg(count, char*));
+                               break;
+                       case 'U':
+                       {
+                               PyObject *obj = va_arg(count, PyObject *);
+                               assert(obj && PyUnicode_Check(obj));
+                               n += PyUnicode_GET_SIZE(obj);
+                               break;
+                       }
+                       case 'p':
+                               (void) va_arg(count, void *);   /* must be read with the same type as in step 2 (void*), not int */
+                               /* maximum 64-bit pointer representation:
+                                * 0xffffffffffffffff
+                                * so 19 characters is enough.
+                                * XXX I count 18 -- what's the extra for?
+                                */
+                               n += 19;
+                               break;
+                       default:
+                               /* if we stumble upon an unknown
+                                  formatting code, copy the rest of
+                                  the format string to the output
+                                  string. (we cannot just skip the
+                                  code, since there's no way to know
+                                  what's in the argument list) */
+                               n += strlen(p);
+                               goto expand;
+                       }
+               } else
+                       n++;
+       }
+ expand:
+       /* step 2: fill the buffer */
+       /* Since we've analyzed how much space we need for the worst case,
+          we don't have to resize the string. */
+       string = PyUnicode_FromUnicode(NULL, n);
+       if (!string)
+               return NULL;
+
+       s = PyUnicode_AS_UNICODE(string);
+
+       for (f = format; *f; f++) {
+               if (*f == '%') {
+                       const char* p = f++;    /* p keeps the '%' position; f moves on to the flags */
+                       int longflag = 0;
+                       int size_tflag = 0;
+                       /* parse the width.precision part (we're only
+                          interested in the precision value, if any) */
+                       n = 0;  /* NOTE(review): the value parsed here is not used by any case below */
+                       while (isdigit(Py_CHARMASK(*f)))
+                               n = (n*10) + *f++ - '0';
+                       if (*f == '.') {
+                               f++;
+                               n = 0;
+                               while (isdigit(Py_CHARMASK(*f)))
+                                       n = (n*10) + *f++ - '0';
+                       }
+                       while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
+                               f++;
+                       /* handle the long flag, but only for %ld and %lu.
+                          others can be added when necessary. */
+                       if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
+                               longflag = 1;
+                               ++f;
+                       }
+                       /* handle the size_t flag. */
+                       if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
+                               size_tflag = 1;
+                               ++f;
+                       }
+
+                       switch (*f) {
+                       case 'c':
+                               *s++ = va_arg(vargs, int);
+                               break;
+                       case 'd':
+                               if (longflag)
+                                       sprintf(buffer, "%ld", va_arg(vargs, long));
+                               else if (size_tflag)
+                                       sprintf(buffer, "%" PY_FORMAT_SIZE_T "d",
+                                               va_arg(vargs, Py_ssize_t));
+                               else
+                                       sprintf(buffer, "%d", va_arg(vargs, int));
+                               appendstring(buffer);
+                               break;
+                       case 'u':
+                               if (longflag)
+                                       sprintf(buffer, "%lu",
+                                               va_arg(vargs, unsigned long));
+                               else if (size_tflag)
+                                       sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
+                                               va_arg(vargs, size_t));
+                               else
+                                       sprintf(buffer, "%u",
+                                               va_arg(vargs, unsigned int));
+                               appendstring(buffer);
+                               break;
+                       case 'i':
+                               sprintf(buffer, "%i", va_arg(vargs, int));
+                               appendstring(buffer);
+                               break;
+                       case 'x':
+                               sprintf(buffer, "%x", va_arg(vargs, int));
+                               appendstring(buffer);
+                               break;
+                       case 's':
+                               p = va_arg(vargs, char*);       /* p is reused as the source string */
+                               appendstring(p);
+                               break;
+                       case 'U':
+                       {
+                               PyObject *obj = va_arg(vargs, PyObject *);
+                               Py_UNICODE *ucopy = PyUnicode_AS_UNICODE(obj);
+                               Py_ssize_t usize = PyUnicode_GET_SIZE(obj);
+                               Py_ssize_t upos;
+                               for (upos = 0; upos<usize;)
+                                       *s++ = ucopy[upos++];
+                               break;
+                       }
+                       case 'p':
+                               sprintf(buffer, "%p", va_arg(vargs, void*));
+                               /* %p is ill-defined:  ensure leading 0x. */
+                               if (buffer[1] == 'X')
+                                       buffer[1] = 'x';
+                               else if (buffer[1] != 'x') {
+                                       memmove(buffer+2, buffer, strlen(buffer)+1);
+                                       buffer[0] = '0';
+                                       buffer[1] = 'x';
+                               }
+                               appendstring(buffer);
+                               break;
+                       case '%':
+                               *s++ = '%';
+                               break;
+                       default:
+                               appendstring(p);        /* unknown code: emit the rest of the format verbatim, starting at its '%' */
+                               goto end;
+                       }
+               } else
+                       *s++ = *f;
+       }
+
+ end:
+       _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));   /* trim to the characters actually written; NOTE(review): the result is not checked */
+       return string;
+}
+
+#undef appendstring
+
+PyObject *
+PyUnicode_FromFormat(const char *format, ...)
+{
+       /* Variadic front end: collects the arguments and defers to PyUnicode_FromFormatV(). */
+       va_list arguments;
+       PyObject *formatted;
+#ifdef HAVE_STDARG_PROTOTYPES
+       va_start(arguments, format);
+#else
+       va_start(arguments);
+#endif
+       formatted = PyUnicode_FromFormatV(format, arguments);
+       va_end(arguments);      /* runs whether or not formatting succeeded */
+       return formatted;
+}
+
 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
                                wchar_t *w,
                                Py_ssize_t size)
index 8f600dc3459629e2a4b989e82d22cd1d9c645ba0..a272ce31354f3d1304fbe9ac6dadf77b0f3e9cbb 100644 (file)
@@ -424,6 +424,39 @@ do_mkvalue(const char **p_format, va_list *p_va, int flags)
                        return v;
                }
 
+               case 'U':
+               {
+                       PyObject *v;
+                       char *str = va_arg(*p_va, char *);
+                       Py_ssize_t n;
+                       if (**p_format == '#') {
+                               ++*p_format;
+                               if (flags & FLAG_SIZE_T)
+                                       n = va_arg(*p_va, Py_ssize_t);
+                               else
+                                       n = va_arg(*p_va, int);
+                       }
+                       else
+                               n = -1;
+                       if (str == NULL) {
+                               v = Py_None;
+                               Py_INCREF(v);
+                       }
+                       else {
+                               if (n < 0) {
+                                       size_t m = strlen(str);
+                                       if (m > PY_SSIZE_T_MAX) {
+                                               PyErr_SetString(PyExc_OverflowError,
+                                                       "string too long for Python string");
+                                               return NULL;
+                                       }
+                                       n = (Py_ssize_t)m;
+                               }
+                               v = PyUnicode_FromStringAndSize(str, n);
+                       }
+                       return v;
+               }
+
                case 'y':
                {
                        PyObject *v;