return NULL;
}
-/* Not used anymore, now that the encoder supports UTF-16
- surrogates. */
-#if 0
-static
-int utf8_encoding_error(const Py_UNICODE **source,
- char **dest,
- const char *errors,
- const char *details)
-{
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "UTF-8 encoding error: %.400s",
- details);
- return -1;
- }
- else if (strcmp(errors,"ignore") == 0) {
- return 0;
- }
- else if (strcmp(errors,"replace") == 0) {
- **dest = '?';
- (*dest)++;
- return 0;
- }
- else {
- PyErr_Format(PyExc_ValueError,
- "UTF-8 encoding error; "
- "unknown error handling code: %.400s",
- errors);
- return -1;
- }
-}
-#endif
-
-/* Allocation strategy: we default to Latin-1, then do one resize
- whenever we hit an order boundary. The assumption is that
- characters from higher orders usually occur often enough to warrant
- this.
+/* Allocation strategy: if the string is short, convert into a stack buffer
+ and allocate exactly as much space as is needed at the end. Otherwise
+ allocate the maximum that could be needed (4 result bytes per input
+ Py_UNICODE element), and return the excess memory at the end.
*/
-
PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE *s,
int size,
const char *errors)
{
- PyObject *v;
- char *p;
- int len;
- int i = 0;
- long overalloc = 2;
- int nallocated; /* overalloc * size; PyString_ adds one more for \0 */
+#define MAX_SHORT_UNICHARS 300 /* largest input length (in Py_UNICODE elements) encoded via the stack buffer */
- /* Short-cut for empty strings */
- if (size == 0)
- return PyString_FromStringAndSize(NULL, 0);
+ int i; /* index into s of the next input element to read (Py_UNICODE units, not bytes) */
+ PyObject *v; /* result string object; stays NULL while the stack buffer is in use */
+ char *p; /* next free byte in output buffer */
+ int nallocated; /* number of result bytes allocated */
+ int nneeded; /* number of result bytes actually written */
+ char stackbuf[MAX_SHORT_UNICHARS * 4];
- nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
- v = PyString_FromStringAndSize(NULL, nallocated);
- if (v == NULL)
- return NULL;
+ assert(s != NULL);
+ assert(size >= 0);
- p = PyString_AS_STRING(v);
+ if (size <= MAX_SHORT_UNICHARS) {
+ /* Write into the stack buffer; nallocated can't overflow.
+ * At the end, we'll allocate exactly as much heap space as it
+ * turns out we need.
+ */
+ nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
+ v = NULL; /* will allocate after we're done */
+ p = stackbuf;
+ }
+ else {
+ /* Overallocate on the heap, and give the excess back at the end. */
+ nallocated = size * 4;
+ if (nallocated / 4 != size) /* overflow! NOTE(review): size * 4 has already been evaluated in int here; a pre-multiply test (size > INT_MAX / 4) would avoid relying on signed wraparound */
+ return PyErr_NoMemory();
+ v = PyString_FromStringAndSize(NULL, nallocated);
+ if (v == NULL)
+ return NULL;
+ p = PyString_AS_STRING(v);
+ }
- while (i < size) {
+ for (i = 0; i < size;) {
Py_UCS4 ch = s[i++];
if (ch < 0x80)
- /* Encode ASCII */
+ /* Encode ASCII: one output byte */
*p++ = (char) ch;
else if (ch < 0x0800) {
- /* Encode Latin-1 */
+ /* Encode 0x80..0x7FF (Latin-1 and beyond): two output bytes */
*p++ = (char)(0xc0 | (ch >> 6));
*p++ = (char)(0x80 | (ch & 0x3f));
}
-
else {
- /* Encode UCS2 Unicode ordinals */
- if (ch < 0x10000) {
-
- /* Special case: check for high surrogate */
- if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
- Py_UCS4 ch2 = s[i];
- /* Check for low surrogate and combine the two to
- form a UCS4 value */
- if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
+ /* Encode UCS2 Unicode ordinals: three output bytes unless a surrogate pair is found */
+ if (ch < 0x10000) {
+ /* Special case: high surrogate with at least one more input element after it */
+ if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
+ Py_UCS4 ch2 = s[i];
+ /* Check for low surrogate and combine the two to
+ form a UCS4 value */
+ if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
- i++;
- goto encodeUCS4;
+ i++;
+ goto encodeUCS4;
}
- /* Fall through: handles isolated high surrogates */
+ /* Fall through: an isolated high surrogate is encoded as three bytes below */
}
-
- if (overalloc < 3) {
- len = Py_SAFE_DOWNCAST(p-PyString_AS_STRING(v), long, int);
- assert(len <= nallocated);
- overalloc = 3;
- nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
- if (_PyString_Resize(&v, nallocated))
- goto onError;
- p = PyString_AS_STRING(v) + len;
- }
*p++ = (char)(0xe0 | (ch >> 12));
- *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
- *p++ = (char)(0x80 | (ch & 0x3f));
- continue;
- }
-
- /* Encode UCS4 Unicode ordinals */
- encodeUCS4:
- if (overalloc < 4) {
- len = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
- assert(len <= nallocated);
- overalloc = 4;
- nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
- if (_PyString_Resize(&v, nallocated))
- goto onError;
- p = PyString_AS_STRING(v) + len;
- }
- *p++ = (char)(0xf0 | (ch >> 18));
- *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
- *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
- *p++ = (char)(0x80 | (ch & 0x3f));
- }
+ *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+ *p++ = (char)(0x80 | (ch & 0x3f));
+ continue;
+ }
+encodeUCS4:
+ /* Encode UCS4 Unicode ordinals (0x10000 and up, including combined surrogate pairs): four output bytes */
+ *p++ = (char)(0xf0 | (ch >> 18));
+ *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
+ *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+ *p++ = (char)(0x80 | (ch & 0x3f));
+ }
}
- *p = '\0';
- len = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
- assert(len <= nallocated);
- if (_PyString_Resize(&v, len))
- goto onError;
+ if (v == NULL) {
+ /* This was stack allocated: build the result from the nneeded bytes written to stackbuf. */
+ nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
+ assert(nneeded <= nallocated);
+ v = PyString_FromStringAndSize(stackbuf, nneeded);
+ }
+ else {
+ /* Cut back to size actually needed. NOTE(review): the result of _PyString_Resize is not checked; presumably it sets v to NULL on failure so the caller still sees the error - confirm. */
+ nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
+ assert(nneeded <= nallocated);
+ _PyString_Resize(&v, nneeded);
+ }
return v;
- onError:
- Py_DECREF(v);
- return NULL;
+#undef MAX_SHORT_UNICHARS
}
PyObject *PyUnicode_AsUTF8String(PyObject *unicode)