SF patch 549375: Compromise PyUnicode_EncodeUTF8

author Tim Peters <tim.peters@gmail.com>

Sat, 27 Apr 2002 18:03:26 +0000 (18:03 +0000)

committer Tim Peters <tim.peters@gmail.com>

Sat, 27 Apr 2002 18:03:26 +0000 (18:03 +0000)
author Tim Peters <tim.peters@gmail.com>
Sat, 27 Apr 2002 18:03:26 +0000 (18:03 +0000)
committer Tim Peters <tim.peters@gmail.com>
Sat, 27 Apr 2002 18:03:26 +0000 (18:03 +0000)
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 54db9aaddf0d266abb27704c56c893e13d964018..c4c8d8d64098e84db5a6e57976238a0beca43908 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1138,142 +1138,104 @@ onError:
      return NULL;
  }
  
-/* Not used anymore, now that the encoder supports UTF-16
-   surrogates. */
-#if 0
-static
-int utf8_encoding_error(const Py_UNICODE **source,
-                       char **dest,
-                       const char *errors,
-                       const char *details) 
-{
-    if ((errors == NULL) ||
-       (strcmp(errors,"strict") == 0)) {
-       PyErr_Format(PyExc_UnicodeError,
-                    "UTF-8 encoding error: %.400s",
-                    details);
-       return -1;
-    }
-    else if (strcmp(errors,"ignore") == 0) {
-       return 0;
-    }
-    else if (strcmp(errors,"replace") == 0) {
-       **dest = '?';
-       (*dest)++;
-       return 0;
-    }
-    else {
-       PyErr_Format(PyExc_ValueError,
-                    "UTF-8 encoding error; "
-                    "unknown error handling code: %.400s",
-                    errors);
-       return -1;
-    }
-}
-#endif
-
-/* Allocation strategy: we default to Latin-1, then do one resize
-   whenever we hit an order boundary. The assumption is that
-   characters from higher orders usually occur often enough to warrant
-   this.
+/* Encode size Py_UNICODE units from s as UTF-8.  Allocation strategy:  if the
+   string is short (at most MAX_SHORT_UNICHARS units), convert into a stack buffer
+   and build the result from exactly the bytes produced.  Else allocate the maximum
+   possibly needed (4 result bytes per unit) and cut back at the end.  errors is unused.
  */
-
  PyObject *
  PyUnicode_EncodeUTF8(const Py_UNICODE *s,
                      int size,
                      const char *errors)
  {
-    PyObject *v;
-    char *p;
-    int len;
-    int i = 0;
-    long overalloc = 2;
-    int nallocated;  /* overalloc * size; PyString_ adds one more for \0 */
+#define MAX_SHORT_UNICHARS 300  /* largest input size (in units) we'll do on the stack */
  
-    /* Short-cut for empty strings */
-    if (size == 0)
-       return PyString_FromStringAndSize(NULL, 0);
+    int i;              /* index into s of next input code unit */
+    PyObject *v;        /* result string object; NULL while using stackbuf */
+    char *p;            /* next free byte in output buffer */
+    int nallocated;     /* number of result bytes allocated */
+    int nneeded;        /* number of result bytes actually written */
+    char stackbuf[MAX_SHORT_UNICHARS * 4];
  
-    nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
-    v = PyString_FromStringAndSize(NULL, nallocated);
-    if (v == NULL)
-        return NULL;
+    assert(s != NULL);
+    assert(size >= 0);
  
-    p = PyString_AS_STRING(v);
+    if (size <= MAX_SHORT_UNICHARS) {
+        /* Write into the stack buffer; nallocated can't overflow.
+         * At the end, we'll allocate exactly as much heap space as it
+         * turns out we need.
+         */
+        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
+        v = NULL;   /* will allocate after we're done */
+        p = stackbuf;
+    }
+    else {
+        /* Overallocate on the heap, and give the excess back at the end. */
+        nallocated = size * 4;
+        if (nallocated / 4 != size)  /* overflow!  NOTE(review): size * 4 is signed int arithmetic, whose overflow is undefined in C; this after-the-fact test relies on wraparound */
+            return PyErr_NoMemory();
+        v = PyString_FromStringAndSize(NULL, nallocated);
+        if (v == NULL)
+            return NULL;
+        p = PyString_AS_STRING(v);
+    }
  
-    while (i < size) {
+    for (i = 0; i < size;) {
          Py_UCS4 ch = s[i++];
  
          if (ch < 0x80)
-           /* Encode ASCII */
+            /* Encode ASCII (single byte) */
              *p++ = (char) ch;
  
          else if (ch < 0x0800) {
-           /* Encode Latin-1 */
+            /* Encode U+0080..U+07FF as a two-byte sequence (covers Latin-1) */
              *p++ = (char)(0xc0 | (ch >> 6));
              *p++ = (char)(0x80 | (ch & 0x3f));
          }
-
          else {
-           /* Encode UCS2 Unicode ordinals */
-           if (ch < 0x10000) {
-
-               /* Special case: check for high surrogate */
-               if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
-                   Py_UCS4 ch2 = s[i];
-                   /* Check for low surrogate and combine the two to
-                      form a UCS4 value */
-                   if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
+            /* Encode UCS2 Unicode ordinals (three-byte sequence) */
+            if (ch < 0x10000) {
+                /* Special case: check for high surrogate with a unit after it */
+                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
+                    Py_UCS4 ch2 = s[i];
+                    /* Check for low surrogate and combine the two to
+                       form a UCS4 value; i++ then consumes the second unit */
+                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
                          ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
-                       i++;
-                       goto encodeUCS4;
+                        i++;
+                        goto encodeUCS4;
                      }
-                   /* Fall through: handles isolated high surrogates */
+                    /* Fall through: an isolated high surrogate is encoded by the three-byte code below */
                  }
-
-               if (overalloc < 3) {
-                   len = Py_SAFE_DOWNCAST(p-PyString_AS_STRING(v), long, int);
-                    assert(len <= nallocated);
-                   overalloc = 3;
-                    nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
-                   if (_PyString_Resize(&v, nallocated))
-                       goto onError;
-                   p = PyString_AS_STRING(v) + len;
-               }
                  *p++ = (char)(0xe0 | (ch >> 12));
-               *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
-               *p++ = (char)(0x80 | (ch & 0x3f));
-               continue;
-           }
-
-           /* Encode UCS4 Unicode ordinals */
-       encodeUCS4:
-           if (overalloc < 4) {
-                len = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
-                assert(len <= nallocated);
-               overalloc = 4;
-                nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
-               if (_PyString_Resize(&v, nallocated))
-                   goto onError;
-               p = PyString_AS_STRING(v) + len;
-           }
-           *p++ = (char)(0xf0 | (ch >> 18));
-           *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
-           *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
-           *p++ = (char)(0x80 | (ch & 0x3f));
-       }
+                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+                *p++ = (char)(0x80 | (ch & 0x3f));
+                continue;
+           }
+encodeUCS4:
+            /* Encode UCS4 Unicode ordinals (four-byte sequence) */
+            *p++ = (char)(0xf0 | (ch >> 18));
+            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
+            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+            *p++ = (char)(0x80 | (ch & 0x3f));
+        }
      }
  
-    *p = '\0';
-    len = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
-    assert(len <= nallocated);
-    if (_PyString_Resize(&v, len))
-       goto onError;
+    if (v == NULL) {
+        /* This was stack allocated; build the result from the nneeded bytes in stackbuf. */
+        nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
+        assert(nneeded <= nallocated);
+        v = PyString_FromStringAndSize(stackbuf, nneeded);
+    }
+    else {
+       /* Cut back to size actually needed.  NOTE(review): the return value is ignored; presumably v is set to NULL on failure -- confirm. */
+        nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
+        assert(nneeded <= nallocated);
+        _PyString_Resize(&v, nneeded);
+    }
      return v;
  
- onError:
-    Py_DECREF(v);
-    return NULL;
+#undef MAX_SHORT_UNICHARS
  }
  
  PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
author	Tim Peters <tim.peters@gmail.com>
	Sat, 27 Apr 2002 18:03:26 +0000 (18:03 +0000)
committer	Tim Peters <tim.peters@gmail.com>
	Sat, 27 Apr 2002 18:03:26 +0000 (18:03 +0000)