From: Serhiy Storchaka Date: Sat, 26 Jan 2013 10:16:36 +0000 (+0200) Subject: Issue #10156: In the interpreter's initialization phase, unicode globals X-Git-Tag: v3.3.1rc1~288 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=678db84b3724f99d6e4950ecf4eb7c1f79a47b2f;p=python Issue #10156: In the interpreter's initialization phase, unicode globals are now initialized dynamically as needed. --- 678db84b3724f99d6e4950ecf4eb7c1f79a47b2f diff --cc Misc/NEWS index 9d12d88e98,c216cc9440..108857bdfb --- a/Misc/NEWS +++ b/Misc/NEWS @@@ -12,9 -10,9 +12,12 @@@ What's New in Python 3.3.1 Core and Builtins ----------------- + - Issue #10156: In the interpreter's initialization phase, unicode globals + are now initialized dynamically as needed. + +- Issue #16980: Fix processing of escaped non-ascii bytes in the + unicode-escape-decode decoder. + - Issue #16975: Fix error handling bug in the escape-decode bytes decoder. - Issue #14850: Now a charmap decoder treats U+FFFE as "undefined mapping" diff --cc Objects/unicodeobject.c index a2ddf3e578,92d17771e2..c96a91c397 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@@ -179,17 -99,30 +180,36 @@@ extern "C" Another way to look at this is that to say that the actual reference count of a string is: s->ob_refcnt + (s->state ? 2 : 0) */ - static PyObject *interned; + static PyObject *interned = NULL; -/* Free list for Unicode objects */ -static PyUnicodeObject *free_list = NULL; -static int numfree = 0; - /* The empty Unicode object is shared to improve performance. */ - static PyObject *unicode_empty; -static PyUnicodeObject *unicode_empty = NULL; ++static PyObject *unicode_empty = NULL; + -#define _Py_RETURN_UNICODE_EMPTY() \ ++#define _Py_INCREF_UNICODE_EMPTY() \ + do { \ + if (unicode_empty != NULL) \ + Py_INCREF(unicode_empty); \ + else { \ - unicode_empty = _PyUnicode_New(0); \ - if (unicode_empty != NULL) \ ++ unicode_empty = PyUnicode_New(0, 0); \ ++ if (unicode_empty != NULL) { \ + Py_INCREF(unicode_empty); \ ++ assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \ ++ } \ + } \ - return (PyObject *)unicode_empty; \ + } while (0) + ++#define _Py_RETURN_UNICODE_EMPTY() \ ++ do { \ ++ _Py_INCREF_UNICODE_EMPTY(); \ ++ return unicode_empty; \ ++ } while (0) + +/* List of static strings. */ - static _Py_Identifier *static_strings; ++static _Py_Identifier *static_strings = NULL; + /* Single character Unicode strings in the Latin-1 range are being shared as well. */ - static PyObject *unicode_latin1[256]; -static PyUnicodeObject *unicode_latin1[256] = {NULL}; ++static PyObject *unicode_latin1[256] = {NULL}; /* Fast detection of the most frequent whitespace characters */ const unsigned char _Py_ascii_whitespace[] = { @@@ -290,224 -207,6 +310,223 @@@ PyUnicode_GetMax(void #endif } +#ifdef Py_DEBUG +int +_PyUnicode_CheckConsistency(PyObject *op, int check_content) +{ + PyASCIIObject *ascii; + unsigned int kind; + + assert(PyUnicode_Check(op)); + + ascii = (PyASCIIObject *)op; + kind = ascii->state.kind; + + if (ascii->state.ascii == 1 && ascii->state.compact == 1) { + assert(kind == PyUnicode_1BYTE_KIND); + assert(ascii->state.ready == 1); + } + else { + PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; + void *data; + + if (ascii->state.compact == 1) { + data = compact + 1; + assert(kind == PyUnicode_1BYTE_KIND + || kind == PyUnicode_2BYTE_KIND + || kind == PyUnicode_4BYTE_KIND); + assert(ascii->state.ascii == 0); + assert(ascii->state.ready == 1); + assert (compact->utf8 != data); + } + else { + PyUnicodeObject *unicode = (PyUnicodeObject *)op; + + data = unicode->data.any; + if (kind == PyUnicode_WCHAR_KIND) { + assert(ascii->length == 0); + assert(ascii->hash == -1); + assert(ascii->state.compact == 0); + assert(ascii->state.ascii == 0); + assert(ascii->state.ready == 0); + assert(ascii->state.interned == SSTATE_NOT_INTERNED); + assert(ascii->wstr != NULL); + assert(data == NULL); + assert(compact->utf8 == NULL); + } + else { + assert(kind == PyUnicode_1BYTE_KIND + || kind == PyUnicode_2BYTE_KIND + || kind == PyUnicode_4BYTE_KIND); + assert(ascii->state.compact == 0); + assert(ascii->state.ready == 1); + assert(data != NULL); + if (ascii->state.ascii) { + assert (compact->utf8 == data); + assert (compact->utf8_length == ascii->length); + } + else + assert (compact->utf8 != data); + } + } + if (kind != PyUnicode_WCHAR_KIND) { + if ( +#if SIZEOF_WCHAR_T == 2 + kind == PyUnicode_2BYTE_KIND +#else + kind == PyUnicode_4BYTE_KIND +#endif + ) + { + assert(ascii->wstr == data); + assert(compact->wstr_length == ascii->length); + } else + assert(ascii->wstr != data); + } + + if (compact->utf8 == NULL) + assert(compact->utf8_length == 0); + if (ascii->wstr == NULL) + assert(compact->wstr_length == 0); + } + /* check that the best kind is used */ + if (check_content && kind != PyUnicode_WCHAR_KIND) + { + Py_ssize_t i; + Py_UCS4 maxchar = 0; + void *data; + Py_UCS4 ch; + + data = PyUnicode_DATA(ascii); + for (i=0; i < ascii->length; i++) + { + ch = PyUnicode_READ(kind, data, i); + if (ch > maxchar) + maxchar = ch; + } + if (kind == PyUnicode_1BYTE_KIND) { + if (ascii->state.ascii == 0) { + assert(maxchar >= 128); + assert(maxchar <= 255); + } + else + assert(maxchar < 128); + } + else if (kind == PyUnicode_2BYTE_KIND) { + assert(maxchar >= 0x100); + assert(maxchar <= 0xFFFF); + } + else { + assert(maxchar >= 0x10000); + assert(maxchar <= MAX_UNICODE); + } + assert(PyUnicode_READ(kind, data, ascii->length) == 0); + } + return 1; +} +#endif + +static PyObject* +unicode_result_wchar(PyObject *unicode) +{ +#ifndef Py_DEBUG + Py_ssize_t len; + + assert(Py_REFCNT(unicode) == 1); + + len = _PyUnicode_WSTR_LENGTH(unicode); + if (len == 0) { - Py_INCREF(unicode_empty); + Py_DECREF(unicode); - return unicode_empty; ++ _Py_RETURN_UNICODE_EMPTY(); + } + + if (len == 1) { + wchar_t ch = _PyUnicode_WSTR(unicode)[0]; + if (ch < 256) { + PyObject *latin1_char = get_latin1_char((unsigned char)ch); + Py_DECREF(unicode); + return latin1_char; + } + } + + if (_PyUnicode_Ready(unicode) < 0) { + Py_XDECREF(unicode); + return NULL; + } +#else + /* don't make the result ready in debug mode to ensure that the caller + makes the string ready before using it */ + assert(_PyUnicode_CheckConsistency(unicode, 1)); +#endif + return unicode; +} + +static PyObject* +unicode_result_ready(PyObject *unicode) +{ + Py_ssize_t length; + + length = PyUnicode_GET_LENGTH(unicode); + if (length == 0) { + if (unicode != unicode_empty) { - Py_INCREF(unicode_empty); + Py_DECREF(unicode); ++ _Py_RETURN_UNICODE_EMPTY(); + } + return unicode_empty; + } + + if (length == 1) { + Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); + if (ch < 256) { + PyObject *latin1_char = unicode_latin1[ch]; + if (latin1_char != NULL) { + if (unicode != latin1_char) { + Py_INCREF(latin1_char); + Py_DECREF(unicode); + } + return latin1_char; + } + else { + assert(_PyUnicode_CheckConsistency(unicode, 1)); + Py_INCREF(unicode); + unicode_latin1[ch] = unicode; + return unicode; + } + } + } + + assert(_PyUnicode_CheckConsistency(unicode, 1)); + return unicode; +} + +static PyObject* +unicode_result(PyObject *unicode) +{ + assert(_PyUnicode_CHECK(unicode)); + if (PyUnicode_IS_READY(unicode)) + return unicode_result_ready(unicode); + else + return unicode_result_wchar(unicode); +} + +static PyObject* +unicode_result_unchanged(PyObject *unicode) +{ + if (PyUnicode_CheckExact(unicode)) { + if (PyUnicode_READY(unicode) == -1) + return NULL; + Py_INCREF(unicode); + return unicode; + } + else + /* Subtype -- return genuine unicode string with the same value. */ + return _PyUnicode_Copy(unicode); +} + +#ifdef HAVE_MBCS +static OSVERSIONINFOEX winver; +#endif + /* --- Bloom Filters ----------------------------------------------------- */ /* stuff to implement simple "bloom filters" for Unicode characters. @@@ -1515,103 -418,36 +1534,105 @@@ unicode_dealloc(register PyObject *unic Py_FatalError("Inconsistent interned string state."); } - if (PyUnicode_CheckExact(unicode) && - numfree < PyUnicode_MAXFREELIST) { - /* Keep-Alive optimization */ - if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { - PyObject_DEL(unicode->str); - unicode->str = NULL; - unicode->length = 0; - } - if (unicode->defenc) { - Py_CLEAR(unicode->defenc); - } - /* Add to free list */ - *(PyUnicodeObject **)unicode = free_list; - free_list = unicode; - numfree++; - } - else { - PyObject_DEL(unicode->str); - Py_XDECREF(unicode->defenc); - Py_TYPE(unicode)->tp_free((PyObject *)unicode); + if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) + PyObject_DEL(_PyUnicode_WSTR(unicode)); + if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) + PyObject_DEL(_PyUnicode_UTF8(unicode)); + if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) + PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); + + Py_TYPE(unicode)->tp_free(unicode); +} + +#ifdef Py_DEBUG +static int +unicode_is_singleton(PyObject *unicode) +{ + PyASCIIObject *ascii = (PyASCIIObject *)unicode; + if (unicode == unicode_empty) + return 1; + if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) + { + Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); + if (ch < 256 && unicode_latin1[ch] == unicode) + return 1; } + return 0; } +#endif -static -int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length) +static int +unicode_modifiable(PyObject *unicode) { - register PyUnicodeObject *v; + assert(_PyUnicode_CHECK(unicode)); + if (Py_REFCNT(unicode) != 1) + return 0; + if (_PyUnicode_HASH(unicode) != -1) + return 0; + if (PyUnicode_CHECK_INTERNED(unicode)) + return 0; + if (!PyUnicode_CheckExact(unicode)) + return 0; +#ifdef Py_DEBUG + /* singleton refcount is greater than 1 */ + assert(!unicode_is_singleton(unicode)); +#endif + return 1; +} - /* Argument checks */ - if (unicode == NULL) { +static int +unicode_resize(PyObject **p_unicode, Py_ssize_t length) +{ + PyObject *unicode; + Py_ssize_t old_length; + + assert(p_unicode != NULL); + unicode = *p_unicode; + + assert(unicode != NULL); + assert(PyUnicode_Check(unicode)); + assert(0 <= length); + + if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) + old_length = PyUnicode_WSTR_LENGTH(unicode); + else + old_length = PyUnicode_GET_LENGTH(unicode); + if (old_length == length) + return 0; + + if (length == 0) { ++ _Py_INCREF_UNICODE_EMPTY(); ++ if (!unicode_empty) ++ return -1; + Py_DECREF(*p_unicode); + *p_unicode = unicode_empty; - Py_INCREF(*p_unicode); + return 0; + } + + if (!unicode_modifiable(unicode)) { + PyObject *copy = resize_copy(unicode, length); + if (copy == NULL) + return -1; + Py_DECREF(*p_unicode); + *p_unicode = copy; + return 0; + } + + if (PyUnicode_IS_COMPACT(unicode)) { + PyObject *new_unicode = resize_compact(unicode, length); + if (new_unicode == NULL) + return -1; + *p_unicode = new_unicode; + return 0; + } + return resize_inplace(unicode, length); +} + +int +PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) +{ + PyObject *unicode; + if (p_unicode == NULL) { PyErr_BadInternalCall(); return -1; } @@@ -1729,530 -489,165 +1750,520 @@@ PyUnicode_FromUnicode(const Py_UNICODE /* If the Unicode data is known at construction time, we can apply some optimizations which share commonly used objects. */ - if (u != NULL) { - /* Optimization for empty strings */ - if (size == 0) - _Py_RETURN_UNICODE_EMPTY(); + /* Optimization for empty strings */ - if (size == 0 && unicode_empty != NULL) { - Py_INCREF(unicode_empty); - return unicode_empty; - } ++ if (size == 0) ++ _Py_RETURN_UNICODE_EMPTY(); - /* Single character Unicode objects in the Latin-1 range are - shared when using this constructor */ - if (size == 1 && *u < 256) { - unicode = unicode_latin1[*u]; - if (!unicode) { - unicode = _PyUnicode_New(1); - if (!unicode) - return NULL; - unicode->str[0] = *u; - unicode_latin1[*u] = unicode; - } - Py_INCREF(unicode); - return (PyObject *)unicode; - } - } + /* Single character Unicode objects in the Latin-1 range are + shared when using this constructor */ + if (size == 1 && *u < 256) + return get_latin1_char((unsigned char)*u); + + /* If not empty and not single character, copy the Unicode data + into the new object */ + if (find_maxchar_surrogates(u, u + size, + &maxchar, &num_surrogates) == -1) + return NULL; - unicode = _PyUnicode_New(size); + unicode = PyUnicode_New(size - num_surrogates, maxchar); if (!unicode) return NULL; - /* Copy the Unicode data into the new object */ + switch (PyUnicode_KIND(unicode)) { + case PyUnicode_1BYTE_KIND: + _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, + u, u + size, PyUnicode_1BYTE_DATA(unicode)); + break; + case PyUnicode_2BYTE_KIND: +#if Py_UNICODE_SIZE == 2 + Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); +#else + _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, + u, u + size, PyUnicode_2BYTE_DATA(unicode)); +#endif + break; + case PyUnicode_4BYTE_KIND: +#if SIZEOF_WCHAR_T == 2 + /* This is the only case which has to process surrogates, thus + a simple copy loop is not enough and we need a function. */ + unicode_convert_wchar_to_ucs4(u, u + size, unicode); +#else + assert(num_surrogates == 0); + Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); +#endif + break; + default: + assert(0 && "Impossible state"); + } + + return unicode_result(unicode); +} + +PyObject * +PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) +{ + if (size < 0) { + PyErr_SetString(PyExc_SystemError, + "Negative size passed to PyUnicode_FromStringAndSize"); + return NULL; + } if (u != NULL) - Py_UNICODE_COPY(unicode->str, u, size); + return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); + else + return (PyObject *)_PyUnicode_New(size); +} - return (PyObject *)unicode; +PyObject * +PyUnicode_FromString(const char *u) +{ + size_t size = strlen(u); + if (size > PY_SSIZE_T_MAX) { + PyErr_SetString(PyExc_OverflowError, "input too long"); + return NULL; + } + return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); +} + +PyObject * +_PyUnicode_FromId(_Py_Identifier *id) +{ + if (!id->object) { + id->object = PyUnicode_DecodeUTF8Stateful(id->string, + strlen(id->string), + NULL, NULL); + if (!id->object) + return NULL; + PyUnicode_InternInPlace(&id->object); + assert(!id->next); + id->next = static_strings; + static_strings = id; + } + return id->object; +} + +void +_PyUnicode_ClearStaticStrings() +{ + _Py_Identifier *tmp, *s = static_strings; + while (s) { + Py_DECREF(s->object); + s->object = NULL; + tmp = s->next; + s->next = NULL; + s = tmp; + } + static_strings = NULL; +} + +/* Internal function, doesn't check maximum character */ + +PyObject* +_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) +{ + const unsigned char *s = (const unsigned char *)buffer; + PyObject *unicode; + if (size == 1) { +#ifdef Py_DEBUG + assert(s[0] < 128); +#endif + return get_latin1_char(s[0]); + } + unicode = PyUnicode_New(size, 127); + if (!unicode) + return NULL; + memcpy(PyUnicode_1BYTE_DATA(unicode), s, size); + assert(_PyUnicode_CheckConsistency(unicode, 1)); + return unicode; +} + +static Py_UCS4 +kind_maxchar_limit(unsigned int kind) +{ + switch (kind) { + case PyUnicode_1BYTE_KIND: + return 0x80; + case PyUnicode_2BYTE_KIND: + return 0x100; + case PyUnicode_4BYTE_KIND: + return 0x10000; + default: + assert(0 && "invalid kind"); + return MAX_UNICODE; + } +} + +Py_LOCAL_INLINE(Py_UCS4) +align_maxchar(Py_UCS4 maxchar) +{ + if (maxchar <= 127) + return 127; + else if (maxchar <= 255) + return 255; + else if (maxchar <= 65535) + return 65535; + else + return MAX_UNICODE; +} + +static PyObject* +_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) +{ + PyObject *res; + unsigned char max_char; + - if (size == 0) { - Py_INCREF(unicode_empty); - return unicode_empty; - } ++ if (size == 0) ++ _Py_RETURN_UNICODE_EMPTY(); + assert(size > 0); + if (size == 1) + return get_latin1_char(u[0]); + + max_char = ucs1lib_find_max_char(u, u + size); + res = PyUnicode_New(size, max_char); + if (!res) + return NULL; + memcpy(PyUnicode_1BYTE_DATA(res), u, size); + assert(_PyUnicode_CheckConsistency(res, 1)); + return res; +} + +static PyObject* +_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) +{ + PyObject *res; + Py_UCS2 max_char; + - if (size == 0) { - Py_INCREF(unicode_empty); - return unicode_empty; - } ++ if (size == 0) ++ _Py_RETURN_UNICODE_EMPTY(); + assert(size > 0); + if (size == 1) { + Py_UCS4 ch = u[0]; + if (ch < 256) + return get_latin1_char((unsigned char)ch); + + res = PyUnicode_New(1, ch); + if (res == NULL) + return NULL; + PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); + assert(_PyUnicode_CheckConsistency(res, 1)); + return res; + } + + max_char = ucs2lib_find_max_char(u, u + size); + res = PyUnicode_New(size, max_char); + if (!res) + return NULL; + if (max_char >= 256) + memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); + else { + _PyUnicode_CONVERT_BYTES( + Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); + } + assert(_PyUnicode_CheckConsistency(res, 1)); + return res; +} + +static PyObject* +_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) +{ + PyObject *res; + Py_UCS4 max_char; + - if (size == 0) { - Py_INCREF(unicode_empty); - return unicode_empty; - } ++ if (size == 0) ++ _Py_RETURN_UNICODE_EMPTY(); + assert(size > 0); + if (size == 1) { + Py_UCS4 ch = u[0]; + if (ch < 256) + return get_latin1_char((unsigned char)ch); + + res = PyUnicode_New(1, ch); + if (res == NULL) + return NULL; + PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); + assert(_PyUnicode_CheckConsistency(res, 1)); + return res; + } + + max_char = ucs4lib_find_max_char(u, u + size); + res = PyUnicode_New(size, max_char); + if (!res) + return NULL; + if (max_char < 256) + _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, + PyUnicode_1BYTE_DATA(res)); + else if (max_char < 0x10000) + _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, + PyUnicode_2BYTE_DATA(res)); + else + memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); + assert(_PyUnicode_CheckConsistency(res, 1)); + return res; } -PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) +PyObject* +PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) { - PyUnicodeObject *unicode; - if (size < 0) { - PyErr_SetString(PyExc_SystemError, - "Negative size passed to PyUnicode_FromStringAndSize"); + PyErr_SetString(PyExc_ValueError, "size must be positive"); + return NULL; + } + switch (kind) { + case PyUnicode_1BYTE_KIND: + return _PyUnicode_FromUCS1(buffer, size); + case PyUnicode_2BYTE_KIND: + return _PyUnicode_FromUCS2(buffer, size); + case PyUnicode_4BYTE_KIND: + return _PyUnicode_FromUCS4(buffer, size); + default: + PyErr_SetString(PyExc_SystemError, "invalid kind"); return NULL; } +} - /* If the Unicode data is known at construction time, we can apply - some optimizations which share commonly used objects. - Also, this means the input must be UTF-8, so fall back to the - UTF-8 decoder at the end. */ - if (u != NULL) { +Py_UCS4 +_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end) +{ + enum PyUnicode_Kind kind; + void *startptr, *endptr; - /* Optimization for empty strings */ - if (size == 0) - _Py_RETURN_UNICODE_EMPTY(); + assert(PyUnicode_IS_READY(unicode)); + assert(0 <= start); + assert(end <= PyUnicode_GET_LENGTH(unicode)); + assert(start <= end); - /* Single characters are shared when using this constructor. - Restrict to ASCII, since the input must be UTF-8. */ - if (size == 1 && Py_CHARMASK(*u) < 128) { - unicode = unicode_latin1[Py_CHARMASK(*u)]; - if (!unicode) { - unicode = _PyUnicode_New(1); - if (!unicode) - return NULL; - unicode->str[0] = Py_CHARMASK(*u); - unicode_latin1[Py_CHARMASK(*u)] = unicode; - } - Py_INCREF(unicode); - return (PyObject *)unicode; - } + if (start == 0 && end == PyUnicode_GET_LENGTH(unicode)) + return PyUnicode_MAX_CHAR_VALUE(unicode); - return PyUnicode_DecodeUTF8(u, size, NULL); + if (start == end) + return 127; + + if (PyUnicode_IS_ASCII(unicode)) + return 127; + + kind = PyUnicode_KIND(unicode); + startptr = PyUnicode_DATA(unicode); + endptr = (char *)startptr + end * kind; + startptr = (char *)startptr + start * kind; + switch(kind) { + case PyUnicode_1BYTE_KIND: + return ucs1lib_find_max_char(startptr, endptr); + case PyUnicode_2BYTE_KIND: + return ucs2lib_find_max_char(startptr, endptr); + case PyUnicode_4BYTE_KIND: + return ucs4lib_find_max_char(startptr, endptr); + default: + assert(0); + return 0; } +} - unicode = _PyUnicode_New(size); - if (!unicode) - return NULL; +/* Ensure that a string uses the most efficient storage, if it is not the + case: create a new string with of the right kind. Write NULL into *p_unicode + on error. */ +static void +unicode_adjust_maxchar(PyObject **p_unicode) +{ + PyObject *unicode, *copy; + Py_UCS4 max_char; + Py_ssize_t len; + unsigned int kind; + + assert(p_unicode != NULL); + unicode = *p_unicode; + assert(PyUnicode_IS_READY(unicode)); + if (PyUnicode_IS_ASCII(unicode)) + return; - return (PyObject *)unicode; + len = PyUnicode_GET_LENGTH(unicode); + kind = PyUnicode_KIND(unicode); + if (kind == PyUnicode_1BYTE_KIND) { + const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); + max_char = ucs1lib_find_max_char(u, u + len); + if (max_char >= 128) + return; + } + else if (kind == PyUnicode_2BYTE_KIND) { + const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); + max_char = ucs2lib_find_max_char(u, u + len); + if (max_char >= 256) + return; + } + else { + const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); + assert(kind == PyUnicode_4BYTE_KIND); + max_char = ucs4lib_find_max_char(u, u + len); + if (max_char >= 0x10000) + return; + } + copy = PyUnicode_New(len, max_char); + if (copy != NULL) + _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len); + Py_DECREF(unicode); + *p_unicode = copy; } -PyObject *PyUnicode_FromString(const char *u) +PyObject* +_PyUnicode_Copy(PyObject *unicode) { - size_t size = strlen(u); - if (size > PY_SSIZE_T_MAX) { - PyErr_SetString(PyExc_OverflowError, "input too long"); + Py_ssize_t length; + PyObject *copy; + + if (!PyUnicode_Check(unicode)) { + PyErr_BadInternalCall(); return NULL; } + if (PyUnicode_READY(unicode) == -1) + return NULL; - return PyUnicode_FromStringAndSize(u, size); -} - -#ifdef HAVE_WCHAR_H + length = PyUnicode_GET_LENGTH(unicode); + copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); + if (!copy) + return NULL; + assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); -#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4) -# define CONVERT_WCHAR_TO_SURROGATES -#endif + Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), + length * PyUnicode_KIND(unicode)); + assert(_PyUnicode_CheckConsistency(copy, 1)); + return copy; +} -#ifdef CONVERT_WCHAR_TO_SURROGATES -/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need - to convert from UTF32 to UTF16. */ +/* Widen Unicode objects to larger buffers. Don't write terminating null + character. Return NULL on error. */ -PyObject *PyUnicode_FromWideChar(register const wchar_t *w, - Py_ssize_t size) +void* +_PyUnicode_AsKind(PyObject *s, unsigned int kind) { - PyUnicodeObject *unicode; - register Py_ssize_t i; - Py_ssize_t alloc; - const wchar_t *orig_w; + Py_ssize_t len; + void *result; + unsigned int skind; - if (w == NULL) { - if (size == 0) - return PyUnicode_FromStringAndSize(NULL, 0); - PyErr_BadInternalCall(); + if (PyUnicode_READY(s) == -1) return NULL; - } - if (size == -1) { - size = wcslen(w); + len = PyUnicode_GET_LENGTH(s); + skind = PyUnicode_KIND(s); + if (skind >= kind) { + PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); + return NULL; } - - alloc = size; - orig_w = w; - for (i = size; i > 0; i--) { - if (*w > 0xFFFF) - alloc++; - w++; + switch (kind) { + case PyUnicode_2BYTE_KIND: + result = PyMem_Malloc(len * sizeof(Py_UCS2)); + if (!result) + return PyErr_NoMemory(); + assert(skind == PyUnicode_1BYTE_KIND); + _PyUnicode_CONVERT_BYTES( + Py_UCS1, Py_UCS2, + PyUnicode_1BYTE_DATA(s), + PyUnicode_1BYTE_DATA(s) + len, + result); + return result; + case PyUnicode_4BYTE_KIND: + result = PyMem_Malloc(len * sizeof(Py_UCS4)); + if (!result) + return PyErr_NoMemory(); + if (skind == PyUnicode_2BYTE_KIND) { + _PyUnicode_CONVERT_BYTES( + Py_UCS2, Py_UCS4, + PyUnicode_2BYTE_DATA(s), + PyUnicode_2BYTE_DATA(s) + len, + result); + } + else { + assert(skind == PyUnicode_1BYTE_KIND); + _PyUnicode_CONVERT_BYTES( + Py_UCS1, Py_UCS4, + PyUnicode_1BYTE_DATA(s), + PyUnicode_1BYTE_DATA(s) + len, + result); + } + return result; + default: + break; } - w = orig_w; - unicode = _PyUnicode_New(alloc); - if (!unicode) - return NULL; + PyErr_SetString(PyExc_SystemError, "invalid kind"); + return NULL; +} - /* Copy the wchar_t data into the new object */ - { - register Py_UNICODE *u; - u = PyUnicode_AS_UNICODE(unicode); - for (i = size; i > 0; i--) { - if (*w > 0xFFFF) { - wchar_t ordinal = *w++; - ordinal -= 0x10000; - *u++ = 0xD800 | (ordinal >> 10); - *u++ = 0xDC00 | (ordinal & 0x3FF); - } - else - *u++ = *w++; +static Py_UCS4* +as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, + int copy_null) +{ + int kind; + void *data; + Py_ssize_t len, targetlen; + if (PyUnicode_READY(string) == -1) + return NULL; + kind = PyUnicode_KIND(string); + data = PyUnicode_DATA(string); + len = PyUnicode_GET_LENGTH(string); + targetlen = len; + if (copy_null) + targetlen++; + if (!target) { + if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) { + PyErr_NoMemory(); + return NULL; + } + target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); + if (!target) { + PyErr_NoMemory(); + return NULL; } } - return (PyObject *)unicode; + else { + if (targetsize < targetlen) { + PyErr_Format(PyExc_SystemError, + "string is longer than the buffer"); + if (copy_null && 0 < targetsize) + target[0] = 0; + return NULL; + } + } + if (kind == PyUnicode_1BYTE_KIND) { + Py_UCS1 *start = (Py_UCS1 *) data; + _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); + } + else if (kind == PyUnicode_2BYTE_KIND) { + Py_UCS2 *start = (Py_UCS2 *) data; + _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); + } + else { + assert(kind == PyUnicode_4BYTE_KIND); + Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); + } + if (copy_null) + target[len] = 0; + return target; } -#else +Py_UCS4* +PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, + int copy_null) +{ + if (target == NULL || targetsize < 0) { + PyErr_BadInternalCall(); + return NULL; + } + return as_ucs4(string, target, targetsize, copy_null); +} -PyObject *PyUnicode_FromWideChar(register const wchar_t *w, - Py_ssize_t size) +Py_UCS4* +PyUnicode_AsUCS4Copy(PyObject *string) { - PyUnicodeObject *unicode; + return as_ucs4(string, NULL, 0, 1); +} +#ifdef HAVE_WCHAR_H + +PyObject * +PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) +{ if (w == NULL) { - if (size == 0) { - Py_INCREF(unicode_empty); - return unicode_empty; - } + if (size == 0) - return PyUnicode_FromStringAndSize(NULL, 0); ++ _Py_RETURN_UNICODE_EMPTY(); PyErr_BadInternalCall(); return NULL; } @@@ -4720,68 -2590,181 +4726,67 @@@ PyUnicode_DecodeUTF8Stateful(const cha if (size == 0) { if (consumed) *consumed = 0; - Py_INCREF(unicode_empty); - return unicode_empty; - return (PyObject *)unicode; ++ _Py_RETURN_UNICODE_EMPTY(); } - /* Unpack UTF-8 encoded data */ - p = unicode->str; - e = s + size; - aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); - - while (s < e) { - Py_UCS4 ch = (unsigned char)*s; - - if (ch < 0x80) { - /* Fast path for runs of ASCII characters. Given that common UTF-8 - input will consist of an overwhelming majority of ASCII - characters, we try to optimize for this case by checking - as many characters as a C 'long' can contain. - First, check if we can do an aligned read, as most CPUs have - a penalty for unaligned reads. - */ - if (!((size_t) s & LONG_PTR_MASK)) { - /* Help register allocation */ - register const char *_s = s; - register Py_UNICODE *_p = p; - while (_s < aligned_end) { - /* Read a whole long at a time (either 4 or 8 bytes), - and do a fast unrolled copy if it only contains ASCII - characters. */ - unsigned long data = *(unsigned long *) _s; - if (data & ASCII_CHAR_MASK) - break; - _p[0] = (unsigned char) _s[0]; - _p[1] = (unsigned char) _s[1]; - _p[2] = (unsigned char) _s[2]; - _p[3] = (unsigned char) _s[3]; -#if (SIZEOF_LONG == 8) - _p[4] = (unsigned char) _s[4]; - _p[5] = (unsigned char) _s[5]; - _p[6] = (unsigned char) _s[6]; - _p[7] = (unsigned char) _s[7]; -#endif - _s += SIZEOF_LONG; - _p += SIZEOF_LONG; - } - s = _s; - p = _p; - if (s == e) - break; - ch = (unsigned char)*s; - } - } - - if (ch < 0x80) { - *p++ = (Py_UNICODE)ch; - s++; - continue; - } + /* ASCII is equivalent to the first 128 ordinals in Unicode. */ + if (size == 1 && (unsigned char)s[0] < 128) { + if (consumed) + *consumed = 1; + return get_latin1_char((unsigned char)s[0]); + } - n = utf8_code_length[ch]; + unicode = PyUnicode_New(size, 127); + if (!unicode) + return NULL; - if (s + n > e) { - if (consumed) - break; - else { - errmsg = "unexpected end of data"; - startinpos = s-starts; - endinpos = startinpos+1; - for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) - endinpos++; - goto utf8Error; - } + outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode)); + s += outpos; + while (s < end) { + Py_UCS4 ch; + int kind = PyUnicode_KIND(unicode); + if (kind == PyUnicode_1BYTE_KIND) { + if (PyUnicode_IS_ASCII(unicode)) + ch = asciilib_utf8_decode(&s, end, + PyUnicode_1BYTE_DATA(unicode), &outpos); + else + ch = ucs1lib_utf8_decode(&s, end, + PyUnicode_1BYTE_DATA(unicode), &outpos); + } else if (kind == PyUnicode_2BYTE_KIND) { + ch = ucs2lib_utf8_decode(&s, end, + PyUnicode_2BYTE_DATA(unicode), &outpos); + } else { + assert(kind == PyUnicode_4BYTE_KIND); + ch = ucs4lib_utf8_decode(&s, end, + PyUnicode_4BYTE_DATA(unicode), &outpos); } - switch (n) { - + switch (ch) { case 0: - errmsg = "invalid start byte"; - startinpos = s-starts; - endinpos = startinpos+1; - goto utf8Error; - + if (s == end || consumed) + goto End; + errmsg = "unexpected end of data"; + startinpos = s - starts; + endinpos = end - starts; + break; case 1: - errmsg = "internal error"; - startinpos = s-starts; - endinpos = startinpos+1; - goto utf8Error; - - case 2: - if ((s[1] & 0xc0) != 0x80) { - errmsg = "invalid continuation byte"; - startinpos = s-starts; - endinpos = startinpos + 1; - goto utf8Error; - } - ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); - assert ((ch > 0x007F) && (ch <= 0x07FF)); - *p++ = (Py_UNICODE)ch; + errmsg = "invalid start byte"; + startinpos = s - starts; + endinpos = startinpos + 1; break; - + case 2: case 3: - /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf - will result in surrogates in range d800-dfff. Surrogates are - not valid UTF-8 so they are rejected. - See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf - (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ - if ((s[1] & 0xc0) != 0x80 || - (s[2] & 0xc0) != 0x80 || - ((unsigned char)s[0] == 0xE0 && - (unsigned char)s[1] < 0xA0) || - ((unsigned char)s[0] == 0xED && - (unsigned char)s[1] > 0x9F)) { - errmsg = "invalid continuation byte"; - startinpos = s-starts; - endinpos = startinpos + 1; - - /* if s[1] first two bits are 1 and 0, then the invalid - continuation byte is s[2], so increment endinpos by 1, - if not, s[1] is invalid and endinpos doesn't need to - be incremented. */ - if ((s[1] & 0xC0) == 0x80) - endinpos++; - goto utf8Error; - } - ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); - assert ((ch > 0x07FF) && (ch <= 0xFFFF)); - *p++ = (Py_UNICODE)ch; - break; - case 4: - if ((s[1] & 0xc0) != 0x80 || - (s[2] & 0xc0) != 0x80 || - (s[3] & 0xc0) != 0x80 || - ((unsigned char)s[0] == 0xF0 && - (unsigned char)s[1] < 0x90) || - ((unsigned char)s[0] == 0xF4 && - (unsigned char)s[1] > 0x8F)) { - errmsg = "invalid continuation byte"; - startinpos = s-starts; - endinpos = startinpos + 1; - if ((s[1] & 0xC0) == 0x80) { - endinpos++; - if ((s[2] & 0xC0) == 0x80) - endinpos++; - } - goto utf8Error; - } - ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + - ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); - assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); - -#ifdef Py_UNICODE_WIDE - *p++ = (Py_UNICODE)ch; -#else - /* compute and append the two surrogates: */ - - /* translate from 10000..10FFFF to 0..FFFF */ - ch -= 0x10000; - - /* high surrogate = top 10 bits added to D800 */ - *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); - - /* low surrogate = bottom 10 bits added to DC00 */ - *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); -#endif + errmsg = "invalid continuation byte"; + startinpos = s - starts; + endinpos = startinpos + ch - 1; break; + default: + if (unicode_putchar(&unicode, &outpos, ch) < 0) + goto onError; + continue; } - s += n; - continue; - utf8Error: - outpos = p-PyUnicode_AS_UNICODE(unicode); if (unicode_decode_call_errorhandler( errors, &errorHandler, "utf-8", errmsg, @@@ -5215,71 -3439,54 +5220,70 @@@ PyUnicode_DecodeUTF16Stateful(const cha byte order setting accordingly. In native mode, the leading BOM mark is skipped, in all other modes, it is copied to the output stream as-is (giving a ZWNBSP character). */ - if (bo == 0) { - if (size >= 2) { - const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; -#ifdef BYTEORDER_IS_LITTLE_ENDIAN - if (bom == 0xFEFF) { - q += 2; - bo = -1; - } - else if (bom == 0xFFFE) { - q += 2; - bo = 1; - } -#else - if (bom == 0xFEFF) { - q += 2; - bo = 1; - } - else if (bom == 0xFFFE) { - q += 2; - bo = -1; - } -#endif + if (bo == 0 && size >= 2) { + const Py_UCS4 bom = (q[1] << 8) | q[0]; + if (bom == 0xFEFF) { + q += 2; + bo = -1; + } + else if (bom == 0xFFFE) { + q += 2; + bo = 1; } + if (byteorder) + *byteorder = bo; } - if (bo == -1) { - /* force LE */ - ihi = 1; - ilo = 0; - } - else if (bo == 1) { - /* force BE */ - ihi = 0; - ilo = 1; + if (q == e) { + if (consumed) + *consumed = size; - Py_INCREF(unicode_empty); - return unicode_empty; ++ _Py_RETURN_UNICODE_EMPTY(); } + #ifdef BYTEORDER_IS_LITTLE_ENDIAN - native_ordering = ilo < ihi; + native_ordering = bo <= 0; #else - native_ordering = ilo > ihi; + native_ordering = bo >= 0; #endif - aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); + /* Note: size will always be longer than the resulting Unicode + character count */ + unicode = PyUnicode_New((e - q + 1) / 2, 127); + if (!unicode) + return NULL; + + outpos = 0; while (1) { - Py_UNICODE ch; - if (e - q < 2) { + Py_UCS4 ch = 0; + if (e - q >= 2) { + int kind = PyUnicode_KIND(unicode); + if (kind == PyUnicode_1BYTE_KIND) { + if (PyUnicode_IS_ASCII(unicode)) + ch = asciilib_utf16_decode(&q, e, + PyUnicode_1BYTE_DATA(unicode), &outpos, + native_ordering); + else + ch = ucs1lib_utf16_decode(&q, e, + PyUnicode_1BYTE_DATA(unicode), &outpos, + native_ordering); + } else if (kind == PyUnicode_2BYTE_KIND) { + ch = ucs2lib_utf16_decode(&q, e, + PyUnicode_2BYTE_DATA(unicode), &outpos, + native_ordering); + } else { + assert(kind == PyUnicode_4BYTE_KIND); + ch = ucs4lib_utf16_decode(&q, e, + PyUnicode_4BYTE_DATA(unicode), &outpos, + native_ordering); + } + } + + switch (ch) + { + case 0: /* remaining byte at the end? (size should be even) */ if (q == e || consumed) - break; + goto End; errmsg = "truncated data"; startinpos = ((const char *)q) - starts; endinpos = ((const char *)e) - starts; @@@ -6558,27 -4831,19 +6562,25 @@@ PyUnicode_DecodeASCII(const char *s PyObject *errorHandler = NULL; PyObject *exc = NULL; - if (size == 0) { - Py_INCREF(unicode_empty); - return unicode_empty; - } ++ if (size == 0) ++ _Py_RETURN_UNICODE_EMPTY(); + /* ASCII is equivalent to the first 128 ordinals in Unicode. */ - if (size == 1 && *(unsigned char*)s < 128) { - Py_UNICODE r = *(unsigned char*)s; - return PyUnicode_FromUnicode(&r, 1); - } + if (size == 1 && (unsigned char)s[0] < 128) + return get_latin1_char((unsigned char)s[0]); - v = _PyUnicode_New(size); - if (v == NULL) + unicode = PyUnicode_New(size, 127); + if (unicode == NULL) goto onError; - if (size == 0) - return (PyObject *)v; - p = PyUnicode_AS_UNICODE(v); + e = s + size; + data = PyUnicode_1BYTE_DATA(unicode); + outpos = ascii_decode(s, e, (Py_UCS1 *)data); + if (outpos == size) + return unicode; + + s += outpos; + kind = PyUnicode_1BYTE_KIND; while (s < e) { register unsigned char c = (unsigned char)*s; if (c < 128) { @@@ -6658,365 -4902,20 +6660,364 @@@ PyUnicode_AsASCIIString(PyObject *unico #define NEED_RETRY #endif -/* XXX This code is limited to "true" double-byte encodings, as - a) it assumes an incomplete character consists of a single byte, and - b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte - encodings, see IsDBCSLeadByteEx documentation. */ +#ifndef WC_ERR_INVALID_CHARS +# define WC_ERR_INVALID_CHARS 0x0080 +#endif + +static char* +code_page_name(UINT code_page, PyObject **obj) +{ + *obj = NULL; + if (code_page == CP_ACP) + return "mbcs"; + if (code_page == CP_UTF7) + return "CP_UTF7"; + if (code_page == CP_UTF8) + return "CP_UTF8"; + + *obj = PyBytes_FromFormat("cp%u", code_page); + if (*obj == NULL) + return NULL; + return PyBytes_AS_STRING(*obj); +} + +static int +is_dbcs_lead_byte(UINT code_page, const char *s, int offset) +{ + const char *curr = s + offset; + const char *prev; + + if (!IsDBCSLeadByteEx(code_page, *curr)) + return 0; + + prev = CharPrevExA(code_page, s, curr, 0); + if (prev == curr) + return 1; + /* FIXME: This code is limited to "true" double-byte encodings, + as it assumes an incomplete character consists of a single + byte. */ + if (curr - prev == 2) + return 1; + if (!IsDBCSLeadByteEx(code_page, *prev)) + return 1; + return 0; +} + +static DWORD +decode_code_page_flags(UINT code_page) +{ + if (code_page == CP_UTF7) { + /* The CP_UTF7 decoder only supports flags=0 */ + return 0; + } + else + return MB_ERR_INVALID_CHARS; +} + +/* + * Decode a byte string from a Windows code page into unicode object in strict + * mode. + * + * Returns consumed size if succeed, returns -2 on decode error, or raise a + * WindowsError and returns -1 on other error. + */ +static int +decode_code_page_strict(UINT code_page, + PyObject **v, + const char *in, + int insize) +{ + const DWORD flags = decode_code_page_flags(code_page); + wchar_t *out; + DWORD outsize; + + /* First get the size of the result */ + assert(insize > 0); + outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); + if (outsize <= 0) + goto error; + + if (*v == NULL) { + /* Create unicode object */ + /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ + *v = (PyObject*)_PyUnicode_New(outsize); + if (*v == NULL) + return -1; + out = PyUnicode_AS_UNICODE(*v); + } + else { + /* Extend unicode object */ + Py_ssize_t n = PyUnicode_GET_SIZE(*v); + if (unicode_resize(v, n + outsize) < 0) + return -1; + out = PyUnicode_AS_UNICODE(*v) + n; + } + + /* Do the conversion */ + outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); + if (outsize <= 0) + goto error; + return insize; + +error: + if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) + return -2; + PyErr_SetFromWindowsErr(0); + return -1; +} + +/* + * Decode a byte string from a code page into unicode object with an error + * handler. + * + * Returns consumed size if succeed, or raise a WindowsError or + * UnicodeDecodeError exception and returns -1 on error. + */ +static int +decode_code_page_errors(UINT code_page, + PyObject **v, + const char *in, const int size, + const char *errors) +{ + const char *startin = in; + const char *endin = in + size; + const DWORD flags = decode_code_page_flags(code_page); + /* Ideally, we should get reason from FormatMessage. This is the Windows + 2000 English version of the message. */ + const char *reason = "No mapping for the Unicode character exists " + "in the target code page."; + /* each step cannot decode more than 1 character, but a character can be + represented as a surrogate pair */ + wchar_t buffer[2], *startout, *out; + int insize, outsize; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + PyObject *encoding_obj = NULL; + char *encoding; + DWORD err; + int ret = -1; + + assert(size > 0); + + encoding = code_page_name(code_page, &encoding_obj); + if (encoding == NULL) + return -1; + + if (errors == NULL || strcmp(errors, "strict") == 0) { + /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a + UnicodeDecodeError. */ + make_decode_exception(&exc, encoding, in, size, 0, 0, reason); + if (exc != NULL) { + PyCodec_StrictErrors(exc); + Py_CLEAR(exc); + } + goto error; + } + + if (*v == NULL) { + /* Create unicode object */ + if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { + PyErr_NoMemory(); + goto error; + } + /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ + *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); + if (*v == NULL) + goto error; + startout = PyUnicode_AS_UNICODE(*v); + } + else { + /* Extend unicode object */ + Py_ssize_t n = PyUnicode_GET_SIZE(*v); + if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { + PyErr_NoMemory(); + goto error; + } + if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) + goto error; + startout = PyUnicode_AS_UNICODE(*v) + n; + } + + /* Decode the byte string character per character */ + out = startout; + while (in < endin) + { + /* Decode a character */ + insize = 1; + do + { + outsize = MultiByteToWideChar(code_page, flags, + in, insize, + buffer, Py_ARRAY_LENGTH(buffer)); + if (outsize > 0) + break; + err = GetLastError(); + if (err != ERROR_NO_UNICODE_TRANSLATION + && err != ERROR_INSUFFICIENT_BUFFER) + { + PyErr_SetFromWindowsErr(0); + goto error; + } + insize++; + } + /* 4=maximum length of a UTF-8 sequence */ + while (insize <= 4 && (in + insize) <= endin); + + if (outsize <= 0) { + Py_ssize_t startinpos, endinpos, outpos; + + startinpos = in - startin; + endinpos = startinpos + 1; + outpos = out - PyUnicode_AS_UNICODE(*v); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + encoding, reason, + &startin, &endin, &startinpos, &endinpos, &exc, &in, + v, &outpos)) + { + goto error; + } + out = PyUnicode_AS_UNICODE(*v) + outpos; + } + else { + in += insize; + memcpy(out, buffer, outsize * sizeof(wchar_t)); + out += outsize; + } + } + + /* write a NUL character at the end */ + *out = 0; + + /* Extend unicode object */ + outsize = out - startout; + assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); + if (unicode_resize(v, outsize) < 0) + goto error; + ret = size; + +error: + Py_XDECREF(encoding_obj); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + return ret; +} + +static PyObject * +decode_code_page_stateful(int code_page, + const char *s, Py_ssize_t size, + const char *errors, Py_ssize_t *consumed) +{ + PyObject *v = NULL; + int chunk_size, final, converted, done; + + if (code_page < 0) { + PyErr_SetString(PyExc_ValueError, "invalid code page number"); + return NULL; + } + + if (consumed) + *consumed = 0; + + do + { +#ifdef NEED_RETRY + if (size > INT_MAX) { + chunk_size = INT_MAX; + final = 0; + done = 0; + } + else +#endif + { + chunk_size = (int)size; + final = (consumed == NULL); + done = 1; + } + + /* Skip trailing lead-byte unless 'final' is set */ + if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1)) + --chunk_size; + + if (chunk_size == 0 && done) { + if (v != NULL) + break; - Py_INCREF(unicode_empty); - return unicode_empty; ++ _Py_RETURN_UNICODE_EMPTY(); + } + + + converted = decode_code_page_strict(code_page, &v, + s, chunk_size); + if (converted == -2) + converted = decode_code_page_errors(code_page, &v, + s, chunk_size, + errors); + assert(converted != 0); + + if (converted < 0) { + Py_XDECREF(v); + return NULL; + } + + if (consumed) + *consumed += converted; + + s += converted; + size -= converted; + } while (!done); + + return unicode_result(v); +} + +PyObject * +PyUnicode_DecodeCodePageStateful(int code_page, + const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) +{ + return decode_code_page_stateful(code_page, s, size, errors, consumed); +} + +PyObject * +PyUnicode_DecodeMBCSStateful(const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) +{ + return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); +} -static int is_dbcs_lead_byte(const char *s, int offset) +PyObject * +PyUnicode_DecodeMBCS(const char *s, + Py_ssize_t size, + const char *errors) { - const char *curr = s + offset; + return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); +} - if (IsDBCSLeadByte(*curr)) { - const char *prev = CharPrev(s, curr); - return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); +static DWORD +encode_code_page_flags(UINT code_page, const char *errors) +{ + if (code_page == CP_UTF8) { + if (winver.dwMajorVersion >= 6) + /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista + and later */ + return WC_ERR_INVALID_CHARS; + else + /* CP_UTF8 only supports flags=0 on Windows older than Vista */ + return 0; + } + else if (code_page == CP_UTF7) { + /* CP_UTF7 only supports flags=0 */ + return 0; + } + else { + if (errors != NULL && strcmp(errors, "replace") == 0) + return 0; + else + return WC_NO_BEST_FIT_CHARS; } - return 0; } /* @@@ -9502,24 -6803,18 +9503,22 @@@ PyUnicode_Join(PyObject *separator, PyO seqlen = PySequence_Fast_GET_SIZE(fseq); /* If empty sequence, return u"". */ if (seqlen == 0) { - res = _PyUnicode_New(0); /* empty sequence; return u"" */ - goto Done; + Py_DECREF(fseq); - Py_INCREF(unicode_empty); - res = unicode_empty; - return res; ++ _Py_RETURN_UNICODE_EMPTY(); } - items = PySequence_Fast_ITEMS(fseq); + /* If singleton sequence with an exact Unicode, return that. */ + last_obj = NULL; + items = PySequence_Fast_ITEMS(fseq); if (seqlen == 1) { - item = items[0]; - if (PyUnicode_CheckExact(item)) { - Py_INCREF(item); - res = (PyUnicodeObject *)item; - goto Done; + if (PyUnicode_CheckExact(items[0])) { + res = items[0]; + Py_INCREF(res); + Py_DECREF(fseq); + return res; } + seplen = 0; + maxchar = 0; } else { /* Set up sep and seplen */ @@@ -10052,180 -6991,96 +10051,182 @@@ replace(PyObject *self, PyObject *str1 if (maxcount < 0) maxcount = PY_SSIZE_T_MAX; - else if (maxcount == 0 || self->length == 0) + else if (maxcount == 0 || slen == 0) goto nothing; - if (str1->length == str2->length) { - Py_ssize_t i; + if (str1 == str2) + goto nothing; + if (skind < kind1) + /* substring too wide to be present */ + goto nothing; + + maxchar = PyUnicode_MAX_CHAR_VALUE(self); + maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); + /* Replacing str1 with str2 may cause a maxchar reduction in the + result string. */ + mayshrink = (maxchar_str2 < maxchar); + maxchar = MAX_MAXCHAR(maxchar, maxchar_str2); + + if (len1 == len2) { /* same length */ - if (str1->length == 0) + if (len1 == 0) goto nothing; - if (str1->length == 1) { + if (len1 == 1) { /* replace characters */ - Py_UNICODE u1, u2; - if (!findchar(self->str, self->length, str1->str[0])) + Py_UCS4 u1, u2; + int rkind; + Py_ssize_t index, pos; + char *src; + + u1 = PyUnicode_READ_CHAR(str1, 0); + pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1); + if (pos < 0) goto nothing; - u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); + u2 = PyUnicode_READ_CHAR(str2, 0); + u = PyUnicode_New(slen, maxchar); if (!u) - return NULL; - Py_UNICODE_COPY(u->str, self->str, self->length); - u1 = str1->str[0]; - u2 = str2->str[0]; - for (i = 0; i < u->length; i++) - if (u->str[i] == u1) { - if (--maxcount < 0) - break; - u->str[i] = u2; - } - } else { - i = stringlib_find( - self->str, self->length, str1->str, str1->length, 0 - ); + goto error; + _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen); + rkind = PyUnicode_KIND(u); + + PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2); + index = 0; + src = sbuf; + while (--maxcount) + { + pos++; + src += pos * PyUnicode_KIND(self); + slen -= pos; + index += pos; + pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1); + if (pos < 0) + break; + PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2); + } + } + else { + int rkind = skind; + char *res; + Py_ssize_t i; + + if (kind1 < rkind) { + /* widen substring */ + buf1 = _PyUnicode_AsKind(str1, rkind); + if (!buf1) goto error; + release1 = 1; + } + i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); if (i < 0) goto nothing; - u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); + if (rkind > kind2) { + /* widen replacement */ + buf2 = _PyUnicode_AsKind(str2, rkind); + if (!buf2) goto error; + release2 = 1; + } + else if (rkind < kind2) { + /* widen self and buf1 */ + rkind = kind2; + if (release1) PyMem_Free(buf1); + release1 = 0; + sbuf = _PyUnicode_AsKind(self, rkind); + if (!sbuf) goto error; + srelease = 1; + buf1 = _PyUnicode_AsKind(str1, rkind); + if (!buf1) goto error; + release1 = 1; + } + u = PyUnicode_New(slen, maxchar); if (!u) - return NULL; - Py_UNICODE_COPY(u->str, self->str, self->length); + goto error; + assert(PyUnicode_KIND(u) == rkind); + res = PyUnicode_DATA(u); + memcpy(res, sbuf, rkind * slen); /* change everything in-place, starting with this one */ - Py_UNICODE_COPY(u->str+i, str2->str, str2->length); - i += str1->length; + memcpy(res + rkind * i, + buf2, + rkind * len2); + i += len1; while ( --maxcount > 0) { - i = stringlib_find(self->str+i, self->length-i, - str1->str, str1->length, - i); + i = anylib_find(rkind, self, + sbuf+rkind*i, slen-i, + str1, buf1, len1, i); if (i == -1) break; - Py_UNICODE_COPY(u->str+i, str2->str, str2->length); - i += str1->length; + memcpy(res + rkind * i, + buf2, + rkind * len2); + i += len1; } } - } else { - - Py_ssize_t n, i, j; - Py_ssize_t product, new_size, delta; - Py_UNICODE *p; - - /* replace strings */ - n = stringlib_count(self->str, self->length, str1->str, str1->length, - maxcount); + } + else { + Py_ssize_t n, i, j, ires; + Py_ssize_t new_size; + int rkind = skind; + char *res; + + if (kind1 < rkind) { + /* widen substring */ + buf1 = _PyUnicode_AsKind(str1, rkind); + if (!buf1) goto error; + release1 = 1; + } + n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); if (n == 0) goto nothing; - /* new_size = self->length + n * (str2->length - str1->length)); */ - delta = (str2->length - str1->length); - if (delta == 0) { - new_size = self->length; - } else { - product = n * (str2->length - str1->length); - if ((product / (str2->length - str1->length)) != n) { - PyErr_SetString(PyExc_OverflowError, - "replace string is too long"); - return NULL; - } - new_size = self->length + product; - if (new_size < 0) { + if (kind2 < rkind) { + /* widen replacement */ + buf2 = _PyUnicode_AsKind(str2, rkind); + if (!buf2) goto error; + release2 = 1; + } + else if (kind2 > rkind) { + /* widen self and buf1 */ + rkind = kind2; + sbuf = _PyUnicode_AsKind(self, rkind); + if (!sbuf) goto error; + srelease = 1; + if (release1) PyMem_Free(buf1); + release1 = 0; + buf1 = _PyUnicode_AsKind(str1, rkind); + if (!buf1) goto error; + release1 = 1; + } + /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - + PyUnicode_GET_LENGTH(str1))); */ + if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) { PyErr_SetString(PyExc_OverflowError, "replace string is too long"); - return NULL; - } + goto error; } - u = _PyUnicode_New(new_size); + new_size = slen + n * (len2 - len1); + if (new_size == 0) { - Py_INCREF(unicode_empty); ++ _Py_INCREF_UNICODE_EMPTY(); ++ if (!unicode_empty) ++ goto error; + u = unicode_empty; + goto done; + } + if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { + PyErr_SetString(PyExc_OverflowError, + "replace string is too long"); + goto error; + } + u = PyUnicode_New(new_size, maxchar); if (!u) - return NULL; - i = 0; - p = u->str; - if (str1->length > 0) { + goto error; + assert(PyUnicode_KIND(u) == rkind); + res = PyUnicode_DATA(u); + ires = i = 0; + if (len1 > 0) { while (n-- > 0) { /* look for next match */ - j = stringlib_find(self->str+i, self->length-i, - str1->str, str1->length, - i); + j = anylib_find(rkind, self, + sbuf + rkind * i, slen-i, + str1, buf1, len1, i); if (j == -1) break; else if (j > i) { @@@ -11649,61 -8208,20 +11650,59 @@@ _PyUnicode_XStrip(PyObject *self, int s j++; } - if (i == 0 && j == len && PyUnicode_CheckExact(self)) { - Py_INCREF(self); - return (PyObject*)self; - } - else - return PyUnicode_FromUnicode(s+i, j-i); + return PyUnicode_Substring(self, i, j); } +PyObject* +PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) +{ + unsigned char *data; + int kind; + Py_ssize_t length; + + if (PyUnicode_READY(self) == -1) + return NULL; + + length = PyUnicode_GET_LENGTH(self); + end = Py_MIN(end, length); + + if (start == 0 && end == length) + return unicode_result_unchanged(self); + + if (start < 0 || end < 0) { + PyErr_SetString(PyExc_IndexError, "string index out of range"); + return NULL; + } - if (start >= length || end < start) { - Py_INCREF(unicode_empty); - return unicode_empty; - } ++ if (start >= length || end < start) ++ _Py_RETURN_UNICODE_EMPTY(); + + length = end - start; + if (PyUnicode_IS_ASCII(self)) { + data = PyUnicode_1BYTE_DATA(self); + return _PyUnicode_FromASCII((char*)(data + start), length); + } + else { + kind = PyUnicode_KIND(self); + data = PyUnicode_1BYTE_DATA(self); + return PyUnicode_FromKindAndData(kind, + data + kind * start, + length); + } +} static PyObject * -do_strip(PyUnicodeObject *self, int striptype) +do_strip(PyObject *self, int striptype) { - Py_UNICODE *s = PyUnicode_AS_UNICODE(self); - Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; + int kind; + void *data; + Py_ssize_t len, i, j; + + if (PyUnicode_READY(self) == -1) + return NULL; + + kind = PyUnicode_KIND(self); + data = PyUnicode_DATA(self); + len = PyUnicode_GET_LENGTH(self); i = 0; if (striptype != RIGHTSTRIP) { @@@ -11797,24 -8320,33 +11796,22 @@@ unicode_rstrip(PyObject *self, PyObjec static PyObject* -unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) +unicode_repeat(PyObject *str, Py_ssize_t len) { - PyUnicodeObject *u; - Py_UNICODE *p; - Py_ssize_t nchars; - size_t nbytes; + PyObject *u; + Py_ssize_t nchars, n; - if (len < 1) { - Py_INCREF(unicode_empty); - return unicode_empty; - } + if (len < 1) + _Py_RETURN_UNICODE_EMPTY(); - if (len == 1 && PyUnicode_CheckExact(str)) { - /* no repeat, return original string */ - Py_INCREF(str); - return (PyObject*) str; - } + /* no repeat, return original string */ + if (len == 1) + return unicode_result_unchanged(str); - /* ensure # of chars needed doesn't overflow int and # of bytes - * needed doesn't overflow size_t - */ - nchars = len * str->length; - if (nchars / len != str->length) { - PyErr_SetString(PyExc_OverflowError, - "repeated string is too long"); + if (PyUnicode_READY(str) == -1) return NULL; - } - nbytes = (nchars + 1) * sizeof(Py_UNICODE); - if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { + + if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { PyErr_SetString(PyExc_OverflowError, "repeated string is too long"); return NULL; @@@ -12797,160 -9186,7 +12794,159 @@@ unicode_endswith(PyObject *self return PyBool_FromLong(result); } -#include "stringlib/string_format.h" +Py_LOCAL_INLINE(void) +_PyUnicodeWriter_Update(_PyUnicodeWriter *writer) +{ + writer->size = PyUnicode_GET_LENGTH(writer->buffer); + writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); + writer->data = PyUnicode_DATA(writer->buffer); + writer->kind = PyUnicode_KIND(writer->buffer); +} + +void +_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length) +{ + memset(writer, 0, sizeof(*writer)); +#ifdef Py_DEBUG + writer->kind = 5; /* invalid kind */ +#endif + writer->min_length = Py_MAX(min_length, 100); + writer->overallocate = (min_length > 0); +} + +int +_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, + Py_ssize_t length, Py_UCS4 maxchar) +{ + Py_ssize_t newlen; + PyObject *newbuffer; + + assert(length > 0); + + if (length > PY_SSIZE_T_MAX - writer->pos) { + PyErr_NoMemory(); + return -1; + } + newlen = writer->pos + length; + + if (writer->buffer == NULL) { + if (writer->overallocate) { + /* overallocate 25% to limit the number of resize */ + if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) + newlen += newlen / 4; + if (newlen < writer->min_length) + newlen = writer->min_length; + } + writer->buffer = PyUnicode_New(newlen, maxchar); + if (writer->buffer == NULL) + return -1; + _PyUnicodeWriter_Update(writer); + return 0; + } + + if (newlen > writer->size) { + if (writer->overallocate) { + /* overallocate 25% to limit the number of resize */ + if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) + newlen += newlen / 4; + if (newlen < writer->min_length) + newlen = writer->min_length; + } + + if (maxchar > writer->maxchar || writer->readonly) { + /* resize + widen */ + newbuffer = PyUnicode_New(newlen, maxchar); + if (newbuffer == NULL) + return -1; + _PyUnicode_FastCopyCharacters(newbuffer, 0, + writer->buffer, 0, writer->pos); + Py_DECREF(writer->buffer); + writer->readonly = 0; + } + else { + newbuffer = resize_compact(writer->buffer, newlen); + if (newbuffer == NULL) + return -1; + } + writer->buffer = newbuffer; + _PyUnicodeWriter_Update(writer); + } + else if (maxchar > writer->maxchar) { + assert(!writer->readonly); + newbuffer = PyUnicode_New(writer->size, maxchar); + if (newbuffer == NULL) + return -1; + _PyUnicode_FastCopyCharacters(newbuffer, 0, + writer->buffer, 0, writer->pos); + Py_DECREF(writer->buffer); + writer->buffer = newbuffer; + _PyUnicodeWriter_Update(writer); + } + return 0; +} + +int +_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) +{ + Py_UCS4 maxchar; + Py_ssize_t len; + + if (PyUnicode_READY(str) == -1) + return -1; + len = PyUnicode_GET_LENGTH(str); + if (len == 0) + return 0; + maxchar = PyUnicode_MAX_CHAR_VALUE(str); + if (maxchar > writer->maxchar || len > writer->size - writer->pos) { + if (writer->buffer == NULL && !writer->overallocate) { + Py_INCREF(str); + writer->buffer = str; + _PyUnicodeWriter_Update(writer); + writer->readonly = 1; + writer->size = 0; + writer->pos += len; + return 0; + } + if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1) + return -1; + } + _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, + str, 0, len); + writer->pos += len; + return 0; +} + +PyObject * +_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) +{ + if (writer->pos == 0) { + Py_XDECREF(writer->buffer); - Py_INCREF(unicode_empty); - return unicode_empty; ++ _Py_RETURN_UNICODE_EMPTY(); + } + if (writer->readonly) { + assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos); + return writer->buffer; + } + if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) { + PyObject *newbuffer; + newbuffer = resize_compact(writer->buffer, writer->pos); + if (newbuffer == NULL) { + Py_DECREF(writer->buffer); + return NULL; + } + writer->buffer = newbuffer; + } + assert(_PyUnicode_CheckConsistency(writer->buffer, 1)); + return writer->buffer; +} + +void +_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) +{ + Py_CLEAR(writer->buffer); +} + +#include "stringlib/unicode_format.h" PyDoc_STRVAR(format__doc__, "S.format(*args, **kwargs) -> str\n\ @@@ -13143,44 -9345,29 +13139,43 @@@ unicode_subscript(PyObject* self, PyObj } if (slicelength <= 0) { - Py_INCREF(unicode_empty); - return unicode_empty; - return PyUnicode_FromUnicode(NULL, 0); - } else if (start == 0 && step == 1 && slicelength == self->length && - PyUnicode_CheckExact(self)) { - Py_INCREF(self); - return (PyObject *)self; ++ _Py_RETURN_UNICODE_EMPTY(); + } else if (start == 0 && step == 1 && + slicelength == PyUnicode_GET_LENGTH(self)) { + return unicode_result_unchanged(self); } else if (step == 1) { - return PyUnicode_FromUnicode(self->str + start, slicelength); - } else { - source_buf = PyUnicode_AS_UNICODE((PyObject*)self); - result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* - sizeof(Py_UNICODE)); - - if (result_buf == NULL) - return PyErr_NoMemory(); - + return PyUnicode_Substring(self, + start, start + slicelength); + } + /* General case */ + src_kind = PyUnicode_KIND(self); + src_data = PyUnicode_DATA(self); + if (!PyUnicode_IS_ASCII(self)) { + kind_limit = kind_maxchar_limit(src_kind); + max_char = 0; for (cur = start, i = 0; i < slicelength; cur += step, i++) { - result_buf[i] = source_buf[cur]; + ch = PyUnicode_READ(src_kind, src_data, cur); + if (ch > max_char) { + max_char = ch; + if (max_char >= kind_limit) + break; + } } + } + else + max_char = 127; + result = PyUnicode_New(slicelength, max_char); + if (result == NULL) + return NULL; + dest_kind = PyUnicode_KIND(result); + dest_data = PyUnicode_DATA(result); - result = PyUnicode_FromUnicode(result_buf, slicelength); - PyObject_FREE(result_buf); - return result; + for (cur = start, i = 0; i < slicelength; cur += step, i++) { + Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); + PyUnicode_WRITE(dest_kind, dest_data, i, ch); } + assert(_PyUnicode_CheckConsistency(result, 1)); + return result; } else { PyErr_SetString(PyExc_TypeError, "string indices must be integers"); return NULL; @@@ -13974,10 -9958,8 +13969,8 @@@ unicode_new(PyTypeObject *type, PyObjec if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", kwlist, &x, &encoding, &errors)) return NULL; - if (x == NULL) { - Py_INCREF(unicode_empty); - return unicode_empty; - } + if (x == NULL) - return (PyObject *)_PyUnicode_New(0); ++ _Py_RETURN_UNICODE_EMPTY(); if (encoding == NULL && errors == NULL) return PyObject_Str(x); else @@@ -14144,12 -10056,10 +14137,10 @@@ PyTypeObject PyUnicode_Type = /* Initialize the Unicode implementation */ -void _PyUnicode_Init(void) +int _PyUnicode_Init(void) { - int i; - /* XXX - move this array to unicodectype.c ? */ - Py_UNICODE linebreak[] = { + Py_UCS2 linebreak[] = { 0x000A, /* LINE FEED */ 0x000D, /* CARRIAGE RETURN */ 0x001C, /* FILE SEPARATOR */ @@@ -14161,13 -10071,12 +14152,11 @@@ }; /* Init the implementation */ - unicode_empty = PyUnicode_New(0, 0); - if (!unicode_empty) { - unicode_empty = _PyUnicode_New(0); - if (!unicode_empty) - return; - } ++ _Py_INCREF_UNICODE_EMPTY(); + if (!unicode_empty) + Py_FatalError("Can't create empty string"); - assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); ++ Py_DECREF(unicode_empty); - for (i = 0; i < 256; i++) - unicode_latin1[i] = NULL; if (PyType_Ready(&PyUnicode_Type) < 0) Py_FatalError("Can't initialize 'unicode'"); @@@ -14207,16 -10121,11 +14196,11 @@@ _PyUnicode_Fini(void { int i; - Py_XDECREF(unicode_empty); - unicode_empty = NULL; + Py_CLEAR(unicode_empty); - for (i = 0; i < 256; i++) { - if (unicode_latin1[i]) { - Py_DECREF(unicode_latin1[i]); - unicode_latin1[i] = NULL; - } - } + for (i = 0; i < 256; i++) + Py_CLEAR(unicode_latin1[i]); - + _PyUnicode_ClearStaticStrings(); (void)PyUnicode_ClearFreeList(); }