Change PyUnicode_FromString[AndSize] to expect UTF-8.

author Martin v. Löwis <martin@v.loewis.de>

Sun, 5 Aug 2007 20:26:11 +0000 (20:26 +0000)

committer Martin v. Löwis <martin@v.loewis.de>

Sun, 5 Aug 2007 20:26:11 +0000 (20:26 +0000)
author Martin v. Löwis <martin@v.loewis.de>
Sun, 5 Aug 2007 20:26:11 +0000 (20:26 +0000)
committer Martin v. Löwis <martin@v.loewis.de>
Sun, 5 Aug 2007 20:26:11 +0000 (20:26 +0000)
diff --git a/Doc/api/concrete.tex b/Doc/api/concrete.tex

index afba52fbec8219367b5a83956817c93831bb614f..4dc3f4c0d8988c5662a4ec528a009cfb44e30f16 100644 (file)
--- a/Doc/api/concrete.tex
+++ b/Doc/api/concrete.tex
@@ -996,10 +996,11 @@ use these APIs:
    \var{u} is \NULL{}.
  \end{cfuncdesc}
  
-\begin{cfuncdesc}{PyObject*}{PyUnicode_FromString}{const char *u}
+\begin{cfuncdesc}{PyObject*}{PyUnicode_FromStringAndSize}{const char *u,
+                                                       Py_ssize_t size}
    Create a Unicode Object from the char buffer \var{u}.
-  \var{u} must be 0-terminated, the bytes will be interpreted as
-  being latin-1 encoded. \var{u} may also be \NULL{} which causes the
+  The bytes will be interpreted as being UTF-8 encoded. 
+  \var{u} may also be \NULL{} which causes the
    contents to be undefined. It is the user's responsibility to fill
    in the needed data.  The buffer is copied into the new object.
    If the buffer is not \NULL{}, the return value might be a shared object.
@@ -1008,6 +1009,12 @@ use these APIs:
    \versionadded{3.0}
  \end{cfuncdesc}
  
+\begin{cfuncdesc}{PyObject*}{PyUnicode_FromString}{const char *u}
+   Create a Unicode object from a UTF-8 encoded null-terminated
+   char buffer \var{u}.
+   \versionadded{3.0}
+\end{cfuncdesc}
+
  \begin{cfuncdesc}{PyObject*}{PyUnicode_FromFormat}{const char *format, ...}
    Take a C \cfunction{printf()}-style \var{format} string and a
    variable number of arguments, calculate the size of the resulting
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c

index 25f77634db98a84e5be9cb0843a30e75cf5ecbb8..47ee8a4082ff9555c9e38872b4004246e820fed8 100644 (file)
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@@ -2724,11 +2724,13 @@ PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
  static PyObject *
  bytes_reduce(PyBytesObject *self)
  {
-    return Py_BuildValue("(O(s#s))",
-                         Py_Type(self),
-                         self->ob_bytes == NULL ? "" : self->ob_bytes,
-                         Py_Size(self),
-                         "latin-1");
+    PyObject *latin1;
+    if (self->ob_bytes)
+       latin1 = PyUnicode_DecodeLatin1(self->ob_bytes, 
+                                       Py_Size(self), NULL);
+    else
+       latin1 = PyUnicode_FromString("");
+    return Py_BuildValue("(O(Ns))", Py_Type(self), latin1, "latin-1");
  }
  
  static PySequenceMethods bytes_as_sequence = {
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index d1b5747f5798ccbf3e5e655910a727088ffc31e2..27fedca463a49882ce996d9f7eceef306a4e6c6b 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -427,7 +427,9 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
  {
      PyUnicodeObject *unicode;
      /* If the Unicode data is known at construction time, we can apply
-       some optimizations which share commonly used objects. */
+       some optimizations which share commonly used objects.
+       Also, this means the input must be UTF-8, so fall back to the
+       UTF-8 decoder at the end. */
      if (u != NULL) {
  
         /* Optimization for empty strings */
@@ -436,8 +438,9 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
             return (PyObject *)unicode_empty;
         }
  
-       /* Single characters are shared when using this constructor */
-       if (size == 1) {
+       /* Single characters are shared when using this constructor.
+           Restrict to ASCII, since the input must be UTF-8. */
+       if (size == 1 && Py_CHARMASK(*u) < 128) {
             unicode = unicode_latin1[Py_CHARMASK(*u)];
             if (!unicode) {
                 unicode = _PyUnicode_New(1);
@@ -449,21 +452,14 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
             Py_INCREF(unicode);
             return (PyObject *)unicode;
         }
+
+        return PyUnicode_DecodeUTF8(u, size, NULL);
      }
  
      unicode = _PyUnicode_New(size);
      if (!unicode)
          return NULL;
  
-    /* Copy the Unicode data into the new object */
-    if (u != NULL) {
-        Py_UNICODE *p = unicode->str;
-        while (size--)
-            *p++ = Py_CHARMASK(*u++);
-        /* Don't need to write trailing 0 because
-           that's already done by _PyUnicode_New */
-    }
-
      return (PyObject *)unicode;
  }
author	Martin v. Löwis <martin@v.loewis.de>
	Sun, 5 Aug 2007 20:26:11 +0000 (20:26 +0000)
committer	Martin v. Löwis <martin@v.loewis.de>
	Sun, 5 Aug 2007 20:26:11 +0000 (20:26 +0000)
Doc/api/concrete.tex		patch \| blob \| history
Objects/bytesobject.c		patch \| blob \| history
Objects/unicodeobject.c		patch \| blob \| history