Another way to look at this is to say that the actual reference
count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
*/
- static PyObject *interned;
+ static PyObject *interned = NULL;
-/* Free list for Unicode objects */
-static PyUnicodeObject *free_list = NULL;
-static int numfree = 0;
-
/* The empty Unicode object is shared to improve performance. */
- static PyObject *unicode_empty;
-static PyUnicodeObject *unicode_empty = NULL;
++static PyObject *unicode_empty = NULL;
+
-#define _Py_RETURN_UNICODE_EMPTY() \
++#define _Py_INCREF_UNICODE_EMPTY() \
+ do { \
+ if (unicode_empty != NULL) \
+ Py_INCREF(unicode_empty); \
+ else { \
- unicode_empty = _PyUnicode_New(0); \
- if (unicode_empty != NULL) \
++ unicode_empty = PyUnicode_New(0, 0); \
++ if (unicode_empty != NULL) { \
+ Py_INCREF(unicode_empty); \
++ assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
++ } \
+ } \
- return (PyObject *)unicode_empty; \
+ } while (0)
+
++#define _Py_RETURN_UNICODE_EMPTY() \
++ do { \
++ _Py_INCREF_UNICODE_EMPTY(); /* takes a new reference, creating the singleton on first use */ \
++ return unicode_empty; /* still NULL here if that creation failed */ \
++ } while (0)
+
+/* List of static strings. */
- static _Py_Identifier *static_strings;
++static _Py_Identifier *static_strings = NULL;
+
/* Single character Unicode strings in the Latin-1 range are being
shared as well. */
- static PyObject *unicode_latin1[256];
-static PyUnicodeObject *unicode_latin1[256] = {NULL};
++static PyObject *unicode_latin1[256] = {NULL};
/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
#endif
}
- Py_INCREF(unicode_empty);
+#ifdef Py_DEBUG
+int
+_PyUnicode_CheckConsistency(PyObject *op, int check_content)
+{ /* Debug-build sanity check: asserts that the state flags, buffers and
+     cached lengths of the string object op agree with each other.  When
+     check_content is nonzero and the kind is not PyUnicode_WCHAR_KIND, the
+     characters are scanned as well to assert that the narrowest suitable
+     kind is in use.  Always returns 1, so a call can be wrapped in
+     assert(). */
+ PyASCIIObject *ascii;
+ unsigned int kind;
+
+ assert(PyUnicode_Check(op));
+
+ ascii = (PyASCIIObject *)op;
+ kind = ascii->state.kind;
+
+ if (ascii->state.ascii == 1 && ascii->state.compact == 1) { /* compact ASCII string */
+ assert(kind == PyUnicode_1BYTE_KIND);
+ assert(ascii->state.ready == 1);
+ }
+ else {
+ PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
+ void *data;
+
+ if (ascii->state.compact == 1) { /* compact, non-ASCII */
+ data = compact + 1; /* character data starts right after the compact header */
+ assert(kind == PyUnicode_1BYTE_KIND
+ || kind == PyUnicode_2BYTE_KIND
+ || kind == PyUnicode_4BYTE_KIND);
+ assert(ascii->state.ascii == 0);
+ assert(ascii->state.ready == 1);
+ assert (compact->utf8 != data);
+ }
+ else { /* not compact: the data pointer is stored in the object */
+ PyUnicodeObject *unicode = (PyUnicodeObject *)op;
+
+ data = unicode->data.any;
+ if (kind == PyUnicode_WCHAR_KIND) { /* not ready yet: only wstr is set */
+ assert(ascii->length == 0);
+ assert(ascii->hash == -1);
+ assert(ascii->state.compact == 0);
+ assert(ascii->state.ascii == 0);
+ assert(ascii->state.ready == 0);
+ assert(ascii->state.interned == SSTATE_NOT_INTERNED);
+ assert(ascii->wstr != NULL);
+ assert(data == NULL);
+ assert(compact->utf8 == NULL);
+ }
+ else { /* ready, non-compact */
+ assert(kind == PyUnicode_1BYTE_KIND
+ || kind == PyUnicode_2BYTE_KIND
+ || kind == PyUnicode_4BYTE_KIND);
+ assert(ascii->state.compact == 0);
+ assert(ascii->state.ready == 1);
+ assert(data != NULL);
+ if (ascii->state.ascii) { /* ASCII: the utf8 cache aliases the data buffer */
+ assert (compact->utf8 == data);
+ assert (compact->utf8_length == ascii->length);
+ }
+ else
+ assert (compact->utf8 != data);
+ }
+ }
+ if (kind != PyUnicode_WCHAR_KIND) {
+ if (
+#if SIZEOF_WCHAR_T == 2
+ kind == PyUnicode_2BYTE_KIND
+#else
+ kind == PyUnicode_4BYTE_KIND
+#endif
+ )
+ { /* kind selected by SIZEOF_WCHAR_T: wstr aliases the data buffer */
+ assert(ascii->wstr == data);
+ assert(compact->wstr_length == ascii->length);
+ } else
+ assert(ascii->wstr != data);
+ }
+
+ if (compact->utf8 == NULL) /* an absent cache must report a zero length */
+ assert(compact->utf8_length == 0);
+ if (ascii->wstr == NULL)
+ assert(compact->wstr_length == 0);
+ }
+ /* check that the best kind is used */
+ if (check_content && kind != PyUnicode_WCHAR_KIND)
+ {
+ Py_ssize_t i;
+ Py_UCS4 maxchar = 0;
+ void *data;
+ Py_UCS4 ch;
+
+ data = PyUnicode_DATA(ascii);
+ for (i=0; i < ascii->length; i++) /* find the largest code point stored */
+ {
+ ch = PyUnicode_READ(kind, data, i);
+ if (ch > maxchar)
+ maxchar = ch;
+ }
+ if (kind == PyUnicode_1BYTE_KIND) {
+ if (ascii->state.ascii == 0) { /* Latin-1: needs at least one non-ASCII character */
+ assert(maxchar >= 128);
+ assert(maxchar <= 255);
+ }
+ else
+ assert(maxchar < 128);
+ }
+ else if (kind == PyUnicode_2BYTE_KIND) {
+ assert(maxchar >= 0x100);
+ assert(maxchar <= 0xFFFF);
+ }
+ else {
+ assert(maxchar >= 0x10000);
+ assert(maxchar <= MAX_UNICODE);
+ }
+ assert(PyUnicode_READ(kind, data, ascii->length) == 0); /* a null character follows the last one */
+ }
+ return 1;
+}
+#endif
+
+/* Normalize a newly created string that is still in the wchar_t
+ representation (not ready). The caller must hold the only reference
+ (asserted); that reference is consumed.
+
+ Without Py_DEBUG: an empty string is replaced by the shared empty
+ singleton, a single character below 256 by the shared Latin-1 character
+ object, and any other string is made ready. Returns NULL if making the
+ string ready fails or the empty singleton cannot be created.
+
+ With Py_DEBUG: the string is only consistency-checked and returned as
+ is, so that callers which forget to make it ready are caught. */
+static PyObject*
+unicode_result_wchar(PyObject *unicode)
+{
+#ifndef Py_DEBUG
+ Py_ssize_t len;
+
+ assert(Py_REFCNT(unicode) == 1);
+
+ len = _PyUnicode_WSTR_LENGTH(unicode);
+ if (len == 0) {
- return unicode_empty;
+ Py_DECREF(unicode);
- Py_INCREF(unicode_empty);
++ _Py_RETURN_UNICODE_EMPTY();
+ }
+
+ if (len == 1) {
+ wchar_t ch = _PyUnicode_WSTR(unicode)[0];
+ /* wchar_t may be a signed type: compare as an unsigned code point so
+ that a negative value is not mistaken for a Latin-1 character and
+ then truncated by the (unsigned char) cast below. */
+ if ((Py_UCS4)ch < 256) {
+ PyObject *latin1_char = get_latin1_char((unsigned char)ch);
+ Py_DECREF(unicode);
+ return latin1_char;
+ }
+ }
+
+ if (_PyUnicode_Ready(unicode) < 0) {
+ Py_XDECREF(unicode);
+ return NULL;
+ }
+#else
+ /* don't make the result ready in debug mode to ensure that the caller
+ makes the string ready before using it */
+ assert(_PyUnicode_CheckConsistency(unicode, 1));
+#endif
+ return unicode;
+}
+
+static PyObject*
+unicode_result_ready(PyObject *unicode)
+{ /* Canonicalize a ready string whose reference the caller hands over: an
+     empty string is replaced by the shared empty singleton, and a
+     one-character string below 256 by the cached Latin-1 object (the
+     first such string seen fills the cache slot).  Any other string is
+     returned unchanged. */
+ Py_ssize_t length;
+
+ length = PyUnicode_GET_LENGTH(unicode);
+ if (length == 0) {
+ if (unicode != unicode_empty) {
+ Py_DECREF(unicode);
++ _Py_RETURN_UNICODE_EMPTY();
+ }
+ return unicode_empty; /* already the singleton: hand the caller's reference back */
+ }
+
+ if (length == 1) {
+ Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
+ if (ch < 256) {
+ PyObject *latin1_char = unicode_latin1[ch];
+ if (latin1_char != NULL) {
+ if (unicode != latin1_char) { /* swap the caller's reference for one to the cached object */
+ Py_INCREF(latin1_char);
+ Py_DECREF(unicode);
+ }
+ return latin1_char;
+ }
+ else {
+ assert(_PyUnicode_CheckConsistency(unicode, 1));
+ Py_INCREF(unicode); /* extra reference held by the unicode_latin1[] slot */
+ unicode_latin1[ch] = unicode;
+ return unicode;
+ }
+ }
+ }
+
+ assert(_PyUnicode_CheckConsistency(unicode, 1));
+ return unicode;
+}
+
+/* Post-process a newly built string before it is handed to the caller:
+   a string that is not ready yet goes through unicode_result_wchar(),
+   a ready one through unicode_result_ready(). */
+static PyObject*
+unicode_result(PyObject *unicode)
+{
+    assert(_PyUnicode_CHECK(unicode));
+    if (!PyUnicode_IS_READY(unicode))
+        return unicode_result_wchar(unicode);
+    return unicode_result_ready(unicode);
+}
+
+/* Return `unicode` as the result of an operation that left its value
+   untouched: a new reference to the same object when it is an exact
+   unicode string, otherwise a copy.  Returns NULL on error. */
+static PyObject*
+unicode_result_unchanged(PyObject *unicode)
+{
+    /* Subtype -- return genuine unicode string with the same value. */
+    if (!PyUnicode_CheckExact(unicode))
+        return _PyUnicode_Copy(unicode);
+
+    /* Exact type: share the object itself once it is ready. */
+    if (PyUnicode_READY(unicode) == -1)
+        return NULL;
+    Py_INCREF(unicode);
+    return unicode;
+}
+
+#ifdef HAVE_MBCS
+static OSVERSIONINFOEX winver;
+#endif
+
/* --- Bloom Filters ----------------------------------------------------- */
/* stuff to implement simple "bloom filters" for Unicode characters.
Py_FatalError("Inconsistent interned string state.");
}
- if (PyUnicode_CheckExact(unicode) &&
- numfree < PyUnicode_MAXFREELIST) {
- /* Keep-Alive optimization */
- if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
- PyObject_DEL(unicode->str);
- unicode->str = NULL;
- unicode->length = 0;
- }
- if (unicode->defenc) {
- Py_CLEAR(unicode->defenc);
- }
- /* Add to free list */
- *(PyUnicodeObject **)unicode = free_list;
- free_list = unicode;
- numfree++;
- }
- else {
- PyObject_DEL(unicode->str);
- Py_XDECREF(unicode->defenc);
- Py_TYPE(unicode)->tp_free((PyObject *)unicode);
+ if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
+ PyObject_DEL(_PyUnicode_WSTR(unicode));
+ if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
+ PyObject_DEL(_PyUnicode_UTF8(unicode));
+ if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
+ PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
+
+ Py_TYPE(unicode)->tp_free(unicode);
+}
+
+#ifdef Py_DEBUG
+static int
+unicode_is_singleton(PyObject *unicode)
+{ /* Debug-only helper: returns 1 if unicode is one of the shared objects
+     kept by this file (the empty string or a cached Latin-1 character),
+     0 otherwise. */
+ PyASCIIObject *ascii = (PyASCIIObject *)unicode;
+ if (unicode == unicode_empty)
+ return 1;
+ if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
+ {
+ Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
+ if (ch < 256 && unicode_latin1[ch] == unicode) /* is it the object stored in the cache slot? */
+ return 1;
}
+ return 0;
}
+#endif
-static
-int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
+static int
+unicode_modifiable(PyObject *unicode)
{
- register PyUnicodeObject *v;
+ assert(_PyUnicode_CHECK(unicode));
+ if (Py_REFCNT(unicode) != 1)
+ return 0;
+ if (_PyUnicode_HASH(unicode) != -1)
+ return 0;
+ if (PyUnicode_CHECK_INTERNED(unicode))
+ return 0;
+ if (!PyUnicode_CheckExact(unicode))
+ return 0;
+#ifdef Py_DEBUG
+ /* singleton refcount is greater than 1 */
+ assert(!unicode_is_singleton(unicode));
+#endif
+ return 1;
+}
- /* Argument checks */
- if (unicode == NULL) {
+static int
+unicode_resize(PyObject **p_unicode, Py_ssize_t length)
+{ /* Resize the string referenced by *p_unicode to `length` characters.
+     *p_unicode may be replaced by a different object.  Returns 0 on
+     success and -1 on failure. */
+ PyObject *unicode;
+ Py_ssize_t old_length;
+
+ assert(p_unicode != NULL);
+ unicode = *p_unicode;
+
+ assert(unicode != NULL);
+ assert(PyUnicode_Check(unicode));
+ assert(0 <= length);
+
+ if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) /* not ready: only the wstr length is meaningful */
+ old_length = PyUnicode_WSTR_LENGTH(unicode);
+ else
+ old_length = PyUnicode_GET_LENGTH(unicode);
+ if (old_length == length) /* nothing to do */
+ return 0;
+
+ if (length == 0) { /* shrinking to empty: switch to the shared empty string */
++ _Py_INCREF_UNICODE_EMPTY();
++ if (!unicode_empty) /* the singleton could not be created */
++ return -1;
+ Py_DECREF(*p_unicode);
+ *p_unicode = unicode_empty;
- Py_INCREF(*p_unicode);
+ return 0;
+ }
+
+ if (!unicode_modifiable(unicode)) { /* must not be changed in place: build a resized copy */
+ PyObject *copy = resize_copy(unicode, length);
+ if (copy == NULL)
+ return -1;
+ Py_DECREF(*p_unicode);
+ *p_unicode = copy;
+ return 0;
+ }
+
+ if (PyUnicode_IS_COMPACT(unicode)) {
+ PyObject *new_unicode = resize_compact(unicode, length);
+ if (new_unicode == NULL)
+ return -1;
+ *p_unicode = new_unicode; /* no DECREF of the old pointer here; presumably resize_compact() takes it over -- TODO confirm */
+ return 0;
+ }
+ return resize_inplace(unicode, length);
+}
+
+int
+PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
+{
+ PyObject *unicode;
+ if (p_unicode == NULL) {
PyErr_BadInternalCall();
return -1;
}
/* If the Unicode data is known at construction time, we can apply
some optimizations which share commonly used objects. */
- if (u != NULL) {
- /* Optimization for empty strings */
- if (size == 0)
- _Py_RETURN_UNICODE_EMPTY();
+ /* Optimization for empty strings */
- if (size == 0 && unicode_empty != NULL) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
++ if (size == 0)
++ _Py_RETURN_UNICODE_EMPTY();
- /* Single character Unicode objects in the Latin-1 range are
- shared when using this constructor */
- if (size == 1 && *u < 256) {
- unicode = unicode_latin1[*u];
- if (!unicode) {
- unicode = _PyUnicode_New(1);
- if (!unicode)
- return NULL;
- unicode->str[0] = *u;
- unicode_latin1[*u] = unicode;
- }
- Py_INCREF(unicode);
- return (PyObject *)unicode;
- }
- }
+ /* Single character Unicode objects in the Latin-1 range are
+ shared when using this constructor */
+ if (size == 1 && *u < 256)
+ return get_latin1_char((unsigned char)*u);
+
+ /* If not empty and not single character, copy the Unicode data
+ into the new object */
+ if (find_maxchar_surrogates(u, u + size,
+ &maxchar, &num_surrogates) == -1)
+ return NULL;
- unicode = _PyUnicode_New(size);
+ unicode = PyUnicode_New(size - num_surrogates, maxchar);
if (!unicode)
return NULL;
- /* Copy the Unicode data into the new object */
+ switch (PyUnicode_KIND(unicode)) {
+ case PyUnicode_1BYTE_KIND:
+ _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
+ u, u + size, PyUnicode_1BYTE_DATA(unicode));
+ break;
+ case PyUnicode_2BYTE_KIND:
+#if Py_UNICODE_SIZE == 2
+ Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
+#else
+ _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
+ u, u + size, PyUnicode_2BYTE_DATA(unicode));
+#endif
+ break;
+ case PyUnicode_4BYTE_KIND:
+#if SIZEOF_WCHAR_T == 2
+ /* This is the only case which has to process surrogates, thus
+ a simple copy loop is not enough and we need a function. */
+ unicode_convert_wchar_to_ucs4(u, u + size, unicode);
+#else
+ assert(num_surrogates == 0);
+ Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
+#endif
+ break;
+ default:
+ assert(0 && "Impossible state");
+ }
+
+ return unicode_result(unicode);
+}
+
+PyObject *
+PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
+{
+ if (size < 0) {
+ PyErr_SetString(PyExc_SystemError,
+ "Negative size passed to PyUnicode_FromStringAndSize");
+ return NULL;
+ }
if (u != NULL)
- Py_UNICODE_COPY(unicode->str, u, size);
+ return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
+ else
+ return (PyObject *)_PyUnicode_New(size);
+}
- return (PyObject *)unicode;
+/* Create a string object by decoding the null-terminated UTF-8 buffer u.
+   Sets OverflowError and returns NULL when the buffer is longer than
+   PY_SSIZE_T_MAX bytes; otherwise returns whatever the UTF-8 decoder
+   returns. */
+PyObject *
+PyUnicode_FromString(const char *u)
+{
+    const size_t nbytes = strlen(u);
+
+    if (nbytes <= PY_SSIZE_T_MAX)
+        return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)nbytes, NULL, NULL);
+
+    PyErr_SetString(PyExc_OverflowError, "input too long");
+    return NULL;
+}
+
+/* Return the string object belonging to the static identifier id.  On
+   first use the object is created by decoding id->string as UTF-8, it is
+   interned, and the identifier is linked into the static_strings list.
+   No new reference is taken for the caller: the object stays owned by id.
+   Returns NULL if the string cannot be created. */
+PyObject *
+_PyUnicode_FromId(_Py_Identifier *id)
+{
+    /* Already initialized by an earlier call. */
+    if (id->object)
+        return id->object;
+
+    id->object = PyUnicode_DecodeUTF8Stateful(id->string,
+                                              strlen(id->string),
+                                              NULL, NULL);
+    if (id->object == NULL)
+        return NULL;
+
+    PyUnicode_InternInPlace(&id->object);
+
+    /* Push the identifier onto the front of the list of static strings. */
+    assert(!id->next);
+    id->next = static_strings;
+    static_strings = id;
+    return id->object;
+}
+
+/* Release the string object of every identifier registered through
+   _PyUnicode_FromId() and empty the static_strings list.  Each identifier
+   is reset (object and next set to NULL), which is the state
+   _PyUnicode_FromId() expects before it initializes an identifier. */
+void
+_PyUnicode_ClearStaticStrings(void)
+{
+    _Py_Identifier *tmp, *s = static_strings;
+    while (s) {
+        /* Py_CLEAR resets the slot before dropping the reference, so the
+           identifier never holds a pointer to a released object. */
+        Py_CLEAR(s->object);
+        tmp = s->next;
+        s->next = NULL;
+        s = tmp;
+    }
+    static_strings = NULL;
+}
+
+/* Internal function, doesn't check maximum character: the caller
+   guarantees that the `size` bytes at `buffer` are all ASCII (only the
+   single-character path asserts it, and only in debug builds).  A single
+   character is served by get_latin1_char(); otherwise the bytes are
+   copied into a new string.  Returns NULL on allocation failure. */
+PyObject*
+_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
+{
+    const unsigned char *bytes = (const unsigned char *)buffer;
+    PyObject *result;
+
+    if (size == 1) {
+#ifdef Py_DEBUG
+        assert(bytes[0] < 128);
+#endif
+        return get_latin1_char(bytes[0]);
+    }
+
+    result = PyUnicode_New(size, 127);
+    if (result == NULL)
+        return NULL;
+    memcpy(PyUnicode_1BYTE_DATA(result), bytes, size);
+    assert(_PyUnicode_CheckConsistency(result, 1));
+    return result;
+}
+
+/* Map a storage kind to its code point threshold: 0x80 for the 1-byte
+   kind, 0x100 for the 2-byte kind and 0x10000 for the 4-byte kind.  Any
+   other value is a programming error: it trips an assertion and yields
+   MAX_UNICODE when assertions are disabled. */
+static Py_UCS4
+kind_maxchar_limit(unsigned int kind)
+{
+    if (kind == PyUnicode_1BYTE_KIND)
+        return 0x80;
+    if (kind == PyUnicode_2BYTE_KIND)
+        return 0x100;
+    if (kind == PyUnicode_4BYTE_KIND)
+        return 0x10000;
+
+    assert(0 && "invalid kind");
+    return MAX_UNICODE;
+}
+
+/* Round maxchar up to the top of the range it falls in: 127, 255, 65535
+   or MAX_UNICODE. */
+Py_LOCAL_INLINE(Py_UCS4)
+align_maxchar(Py_UCS4 maxchar)
+{
+    if (maxchar < 0x80)
+        return 0x7f;
+    if (maxchar < 0x100)
+        return 0xff;
+    if (maxchar < 0x10000)
+        return 0xffff;
+    return MAX_UNICODE;
+}
+
+static PyObject*
+_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
+{
+ PyObject *res;
+ unsigned char max_char;
+
- if (size == 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
++ if (size == 0)
++ _Py_RETURN_UNICODE_EMPTY();
+ assert(size > 0);
+ if (size == 1)
+ return get_latin1_char(u[0]);
+
+ max_char = ucs1lib_find_max_char(u, u + size);
+ res = PyUnicode_New(size, max_char);
+ if (!res)
+ return NULL;
+ memcpy(PyUnicode_1BYTE_DATA(res), u, size);
+ assert(_PyUnicode_CheckConsistency(res, 1));
+ return res;
+}
+
+static PyObject*
+_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
+{
+ PyObject *res;
+ Py_UCS2 max_char;
+
- if (size == 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
++ if (size == 0)
++ _Py_RETURN_UNICODE_EMPTY();
+ assert(size > 0);
+ if (size == 1) {
+ Py_UCS4 ch = u[0];
+ if (ch < 256)
+ return get_latin1_char((unsigned char)ch);
+
+ res = PyUnicode_New(1, ch);
+ if (res == NULL)
+ return NULL;
+ PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
+ assert(_PyUnicode_CheckConsistency(res, 1));
+ return res;
+ }
+
+ max_char = ucs2lib_find_max_char(u, u + size);
+ res = PyUnicode_New(size, max_char);
+ if (!res)
+ return NULL;
+ if (max_char >= 256)
+ memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
+ else {
+ _PyUnicode_CONVERT_BYTES(
+ Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
+ }
+ assert(_PyUnicode_CheckConsistency(res, 1));
+ return res;
+}
+
+static PyObject*
+_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
+{
+ PyObject *res;
+ Py_UCS4 max_char;
+
- if (size == 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
++ if (size == 0)
++ _Py_RETURN_UNICODE_EMPTY();
+ assert(size > 0);
+ if (size == 1) {
+ Py_UCS4 ch = u[0];
+ if (ch < 256)
+ return get_latin1_char((unsigned char)ch);
+
+ res = PyUnicode_New(1, ch);
+ if (res == NULL)
+ return NULL;
+ PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
+ assert(_PyUnicode_CheckConsistency(res, 1));
+ return res;
+ }
+
+ max_char = ucs4lib_find_max_char(u, u + size);
+ res = PyUnicode_New(size, max_char);
+ if (!res)
+ return NULL;
+ if (max_char < 256)
+ _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
+ PyUnicode_1BYTE_DATA(res));
+ else if (max_char < 0x10000)
+ _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
+ PyUnicode_2BYTE_DATA(res));
+ else
+ memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
+ assert(_PyUnicode_CheckConsistency(res, 1));
+ return res;
}
-PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
+PyObject*
+PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
{
- PyUnicodeObject *unicode;
-
if (size < 0) {
- PyErr_SetString(PyExc_SystemError,
- "Negative size passed to PyUnicode_FromStringAndSize");
+ PyErr_SetString(PyExc_ValueError, "size must be positive");
+ return NULL;
+ }
+ switch (kind) {
+ case PyUnicode_1BYTE_KIND:
+ return _PyUnicode_FromUCS1(buffer, size);
+ case PyUnicode_2BYTE_KIND:
+ return _PyUnicode_FromUCS2(buffer, size);
+ case PyUnicode_4BYTE_KIND:
+ return _PyUnicode_FromUCS4(buffer, size);
+ default:
+ PyErr_SetString(PyExc_SystemError, "invalid kind");
return NULL;
}
+}
- /* If the Unicode data is known at construction time, we can apply
- some optimizations which share commonly used objects.
- Also, this means the input must be UTF-8, so fall back to the
- UTF-8 decoder at the end. */
- if (u != NULL) {
+Py_UCS4
+_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
+{
+ enum PyUnicode_Kind kind;
+ void *startptr, *endptr;
- /* Optimization for empty strings */
- if (size == 0)
- _Py_RETURN_UNICODE_EMPTY();
+ assert(PyUnicode_IS_READY(unicode));
+ assert(0 <= start);
+ assert(end <= PyUnicode_GET_LENGTH(unicode));
+ assert(start <= end);
- /* Single characters are shared when using this constructor.
- Restrict to ASCII, since the input must be UTF-8. */
- if (size == 1 && Py_CHARMASK(*u) < 128) {
- unicode = unicode_latin1[Py_CHARMASK(*u)];
- if (!unicode) {
- unicode = _PyUnicode_New(1);
- if (!unicode)
- return NULL;
- unicode->str[0] = Py_CHARMASK(*u);
- unicode_latin1[Py_CHARMASK(*u)] = unicode;
- }
- Py_INCREF(unicode);
- return (PyObject *)unicode;
- }
+ if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
+ return PyUnicode_MAX_CHAR_VALUE(unicode);
- return PyUnicode_DecodeUTF8(u, size, NULL);
+ if (start == end)
+ return 127;
+
+ if (PyUnicode_IS_ASCII(unicode))
+ return 127;
+
+ kind = PyUnicode_KIND(unicode);
+ startptr = PyUnicode_DATA(unicode);
+ endptr = (char *)startptr + end * kind;
+ startptr = (char *)startptr + start * kind;
+ switch(kind) {
+ case PyUnicode_1BYTE_KIND:
+ return ucs1lib_find_max_char(startptr, endptr);
+ case PyUnicode_2BYTE_KIND:
+ return ucs2lib_find_max_char(startptr, endptr);
+ case PyUnicode_4BYTE_KIND:
+ return ucs4lib_find_max_char(startptr, endptr);
+ default:
+ assert(0);
+ return 0;
}
+}
- unicode = _PyUnicode_New(size);
- if (!unicode)
- return NULL;
+/* Ensure that a string uses the most efficient storage; if that is not the
+ case, create a new string of the right kind. Write NULL into *p_unicode
+ on error. */
+static void
+unicode_adjust_maxchar(PyObject **p_unicode)
+{
+ PyObject *unicode, *copy;
+ Py_UCS4 max_char;
+ Py_ssize_t len;
+ unsigned int kind;
+
+ assert(p_unicode != NULL);
+ unicode = *p_unicode;
+ assert(PyUnicode_IS_READY(unicode));
+ if (PyUnicode_IS_ASCII(unicode))
+ return;
- return (PyObject *)unicode;
+ len = PyUnicode_GET_LENGTH(unicode);
+ kind = PyUnicode_KIND(unicode);
+ if (kind == PyUnicode_1BYTE_KIND) {
+ const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
+ max_char = ucs1lib_find_max_char(u, u + len);
+ if (max_char >= 128)
+ return;
+ }
+ else if (kind == PyUnicode_2BYTE_KIND) {
+ const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
+ max_char = ucs2lib_find_max_char(u, u + len);
+ if (max_char >= 256)
+ return;
+ }
+ else {
+ const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
+ assert(kind == PyUnicode_4BYTE_KIND);
+ max_char = ucs4lib_find_max_char(u, u + len);
+ if (max_char >= 0x10000)
+ return;
+ }
+ copy = PyUnicode_New(len, max_char);
+ if (copy != NULL)
+ _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
+ Py_DECREF(unicode);
+ *p_unicode = copy;
}
-PyObject *PyUnicode_FromString(const char *u)
+PyObject*
+_PyUnicode_Copy(PyObject *unicode)
{
- size_t size = strlen(u);
- if (size > PY_SSIZE_T_MAX) {
- PyErr_SetString(PyExc_OverflowError, "input too long");
+ Py_ssize_t length;
+ PyObject *copy;
+
+ if (!PyUnicode_Check(unicode)) {
+ PyErr_BadInternalCall();
return NULL;
}
+ if (PyUnicode_READY(unicode) == -1)
+ return NULL;
- return PyUnicode_FromStringAndSize(u, size);
-}
-
-#ifdef HAVE_WCHAR_H
+ length = PyUnicode_GET_LENGTH(unicode);
+ copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
+ if (!copy)
+ return NULL;
+ assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
-#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
-# define CONVERT_WCHAR_TO_SURROGATES
-#endif
+ Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
+ length * PyUnicode_KIND(unicode));
+ assert(_PyUnicode_CheckConsistency(copy, 1));
+ return copy;
+}
-#ifdef CONVERT_WCHAR_TO_SURROGATES
-/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
- to convert from UTF32 to UTF16. */
+/* Widen Unicode objects to larger buffers. Don't write terminating null
+ character. Return NULL on error.
+
+ s is made ready if necessary. kind must be PyUnicode_2BYTE_KIND or
+ PyUnicode_4BYTE_KIND and strictly wider than the kind of s, otherwise
+ SystemError is set. On success the result is a buffer obtained from
+ PyMem_Malloc() that holds the characters of s converted to the
+ requested width (presumably to be released by the caller with
+ PyMem_Free()). MemoryError is set when the buffer size would overflow
+ or the allocation fails. */
-PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
- Py_ssize_t size)
+void*
+_PyUnicode_AsKind(PyObject *s, unsigned int kind)
{
- PyUnicodeObject *unicode;
- register Py_ssize_t i;
- Py_ssize_t alloc;
- const wchar_t *orig_w;
+ Py_ssize_t len;
+ void *result;
+ unsigned int skind;
- if (w == NULL) {
- if (size == 0)
- return PyUnicode_FromStringAndSize(NULL, 0);
- PyErr_BadInternalCall();
+ if (PyUnicode_READY(s) == -1)
return NULL;
- }
- if (size == -1) {
- size = wcslen(w);
+ len = PyUnicode_GET_LENGTH(s);
+ skind = PyUnicode_KIND(s);
+ if (skind >= kind) {
+ PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
+ return NULL;
}
-
- alloc = size;
- orig_w = w;
- for (i = size; i > 0; i--) {
- if (*w > 0xFFFF)
- alloc++;
- w++;
+ switch (kind) {
+ case PyUnicode_2BYTE_KIND:
+ /* refuse sizes that do not fit before multiplying */
+ if (len > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UCS2))
+ return PyErr_NoMemory();
+ result = PyMem_Malloc(len * sizeof(Py_UCS2));
+ if (!result)
+ return PyErr_NoMemory();
+ assert(skind == PyUnicode_1BYTE_KIND);
+ _PyUnicode_CONVERT_BYTES(
+ Py_UCS1, Py_UCS2,
+ PyUnicode_1BYTE_DATA(s),
+ PyUnicode_1BYTE_DATA(s) + len,
+ result);
+ return result;
+ case PyUnicode_4BYTE_KIND:
+ /* len * 4 can wrap around for a very long narrow string, which
+ would make the buffer too small for the conversion below */
+ if (len > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UCS4))
+ return PyErr_NoMemory();
+ result = PyMem_Malloc(len * sizeof(Py_UCS4));
+ if (!result)
+ return PyErr_NoMemory();
+ if (skind == PyUnicode_2BYTE_KIND) {
+ _PyUnicode_CONVERT_BYTES(
+ Py_UCS2, Py_UCS4,
+ PyUnicode_2BYTE_DATA(s),
+ PyUnicode_2BYTE_DATA(s) + len,
+ result);
+ }
+ else {
+ assert(skind == PyUnicode_1BYTE_KIND);
+ _PyUnicode_CONVERT_BYTES(
+ Py_UCS1, Py_UCS4,
+ PyUnicode_1BYTE_DATA(s),
+ PyUnicode_1BYTE_DATA(s) + len,
+ result);
+ }
+ return result;
+ default:
+ break;
}
- w = orig_w;
- unicode = _PyUnicode_New(alloc);
- if (!unicode)
- return NULL;
+ PyErr_SetString(PyExc_SystemError, "invalid kind");
+ return NULL;
+}
- /* Copy the wchar_t data into the new object */
- {
- register Py_UNICODE *u;
- u = PyUnicode_AS_UNICODE(unicode);
- for (i = size; i > 0; i--) {
- if (*w > 0xFFFF) {
- wchar_t ordinal = *w++;
- ordinal -= 0x10000;
- *u++ = 0xD800 | (ordinal >> 10);
- *u++ = 0xDC00 | (ordinal & 0x3FF);
- }
- else
- *u++ = *w++;
+static Py_UCS4*
+as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
+ int copy_null)
+{
+ int kind;
+ void *data;
+ Py_ssize_t len, targetlen;
+ if (PyUnicode_READY(string) == -1)
+ return NULL;
+ kind = PyUnicode_KIND(string);
+ data = PyUnicode_DATA(string);
+ len = PyUnicode_GET_LENGTH(string);
+ targetlen = len;
+ if (copy_null)
+ targetlen++;
+ if (!target) {
+ if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+ target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
+ if (!target) {
+ PyErr_NoMemory();
+ return NULL;
}
}
- return (PyObject *)unicode;
+ else {
+ if (targetsize < targetlen) {
+ PyErr_Format(PyExc_SystemError,
+ "string is longer than the buffer");
+ if (copy_null && 0 < targetsize)
+ target[0] = 0;
+ return NULL;
+ }
+ }
+ if (kind == PyUnicode_1BYTE_KIND) {
+ Py_UCS1 *start = (Py_UCS1 *) data;
+ _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
+ }
+ else if (kind == PyUnicode_2BYTE_KIND) {
+ Py_UCS2 *start = (Py_UCS2 *) data;
+ _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
+ }
+ else {
+ assert(kind == PyUnicode_4BYTE_KIND);
+ Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
+ }
+ if (copy_null)
+ target[len] = 0;
+ return target;
}
-#else
+/* Copy the characters of `string` as UCS4 into the caller-supplied buffer
+   `target`, which has room for `targetsize` characters; copy_null asks
+   for a terminating null character as well.  A NULL buffer or a negative
+   size is rejected with PyErr_BadInternalCall().  Returns `target` on
+   success and NULL on error. */
+Py_UCS4*
+PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
+                 int copy_null)
+{
+    if (target != NULL && targetsize >= 0)
+        return as_ucs4(string, target, targetsize, copy_null);
+
+    PyErr_BadInternalCall();
+    return NULL;
+}
-PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
- Py_ssize_t size)
+Py_UCS4*
+PyUnicode_AsUCS4Copy(PyObject *string)
{
- PyUnicodeObject *unicode;
+ return as_ucs4(string, NULL, 0, 1);
+}
+#ifdef HAVE_WCHAR_H
+
+PyObject *
+PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
+{
if (w == NULL) {
- if (size == 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
+ if (size == 0)
- return PyUnicode_FromStringAndSize(NULL, 0);
++ _Py_RETURN_UNICODE_EMPTY();
PyErr_BadInternalCall();
return NULL;
}
if (size == 0) {
if (consumed)
*consumed = 0;
- Py_INCREF(unicode_empty);
- return unicode_empty;
- return (PyObject *)unicode;
++ _Py_RETURN_UNICODE_EMPTY();
}
- /* Unpack UTF-8 encoded data */
- p = unicode->str;
- e = s + size;
- aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
-
- while (s < e) {
- Py_UCS4 ch = (unsigned char)*s;
-
- if (ch < 0x80) {
- /* Fast path for runs of ASCII characters. Given that common UTF-8
- input will consist of an overwhelming majority of ASCII
- characters, we try to optimize for this case by checking
- as many characters as a C 'long' can contain.
- First, check if we can do an aligned read, as most CPUs have
- a penalty for unaligned reads.
- */
- if (!((size_t) s & LONG_PTR_MASK)) {
- /* Help register allocation */
- register const char *_s = s;
- register Py_UNICODE *_p = p;
- while (_s < aligned_end) {
- /* Read a whole long at a time (either 4 or 8 bytes),
- and do a fast unrolled copy if it only contains ASCII
- characters. */
- unsigned long data = *(unsigned long *) _s;
- if (data & ASCII_CHAR_MASK)
- break;
- _p[0] = (unsigned char) _s[0];
- _p[1] = (unsigned char) _s[1];
- _p[2] = (unsigned char) _s[2];
- _p[3] = (unsigned char) _s[3];
-#if (SIZEOF_LONG == 8)
- _p[4] = (unsigned char) _s[4];
- _p[5] = (unsigned char) _s[5];
- _p[6] = (unsigned char) _s[6];
- _p[7] = (unsigned char) _s[7];
-#endif
- _s += SIZEOF_LONG;
- _p += SIZEOF_LONG;
- }
- s = _s;
- p = _p;
- if (s == e)
- break;
- ch = (unsigned char)*s;
- }
- }
-
- if (ch < 0x80) {
- *p++ = (Py_UNICODE)ch;
- s++;
- continue;
- }
+ /* ASCII is equivalent to the first 128 ordinals in Unicode. */
+ if (size == 1 && (unsigned char)s[0] < 128) {
+ if (consumed)
+ *consumed = 1;
+ return get_latin1_char((unsigned char)s[0]);
+ }
- n = utf8_code_length[ch];
+ unicode = PyUnicode_New(size, 127);
+ if (!unicode)
+ return NULL;
- if (s + n > e) {
- if (consumed)
- break;
- else {
- errmsg = "unexpected end of data";
- startinpos = s-starts;
- endinpos = startinpos+1;
- for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
- endinpos++;
- goto utf8Error;
- }
+ outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
+ s += outpos;
+ while (s < end) {
+ Py_UCS4 ch;
+ int kind = PyUnicode_KIND(unicode);
+ if (kind == PyUnicode_1BYTE_KIND) {
+ if (PyUnicode_IS_ASCII(unicode))
+ ch = asciilib_utf8_decode(&s, end,
+ PyUnicode_1BYTE_DATA(unicode), &outpos);
+ else
+ ch = ucs1lib_utf8_decode(&s, end,
+ PyUnicode_1BYTE_DATA(unicode), &outpos);
+ } else if (kind == PyUnicode_2BYTE_KIND) {
+ ch = ucs2lib_utf8_decode(&s, end,
+ PyUnicode_2BYTE_DATA(unicode), &outpos);
+ } else {
+ assert(kind == PyUnicode_4BYTE_KIND);
+ ch = ucs4lib_utf8_decode(&s, end,
+ PyUnicode_4BYTE_DATA(unicode), &outpos);
}
- switch (n) {
-
+ switch (ch) {
case 0:
- errmsg = "invalid start byte";
- startinpos = s-starts;
- endinpos = startinpos+1;
- goto utf8Error;
-
+ if (s == end || consumed)
+ goto End;
+ errmsg = "unexpected end of data";
+ startinpos = s - starts;
+ endinpos = end - starts;
+ break;
case 1:
- errmsg = "internal error";
- startinpos = s-starts;
- endinpos = startinpos+1;
- goto utf8Error;
-
- case 2:
- if ((s[1] & 0xc0) != 0x80) {
- errmsg = "invalid continuation byte";
- startinpos = s-starts;
- endinpos = startinpos + 1;
- goto utf8Error;
- }
- ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
- assert ((ch > 0x007F) && (ch <= 0x07FF));
- *p++ = (Py_UNICODE)ch;
+ errmsg = "invalid start byte";
+ startinpos = s - starts;
+ endinpos = startinpos + 1;
break;
-
+ case 2:
case 3:
- /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
- will result in surrogates in range d800-dfff. Surrogates are
- not valid UTF-8 so they are rejected.
- See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
- (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
- if ((s[1] & 0xc0) != 0x80 ||
- (s[2] & 0xc0) != 0x80 ||
- ((unsigned char)s[0] == 0xE0 &&
- (unsigned char)s[1] < 0xA0) ||
- ((unsigned char)s[0] == 0xED &&
- (unsigned char)s[1] > 0x9F)) {
- errmsg = "invalid continuation byte";
- startinpos = s-starts;
- endinpos = startinpos + 1;
-
- /* if s[1] first two bits are 1 and 0, then the invalid
- continuation byte is s[2], so increment endinpos by 1,
- if not, s[1] is invalid and endinpos doesn't need to
- be incremented. */
- if ((s[1] & 0xC0) == 0x80)
- endinpos++;
- goto utf8Error;
- }
- ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
- assert ((ch > 0x07FF) && (ch <= 0xFFFF));
- *p++ = (Py_UNICODE)ch;
- break;
-
case 4:
- if ((s[1] & 0xc0) != 0x80 ||
- (s[2] & 0xc0) != 0x80 ||
- (s[3] & 0xc0) != 0x80 ||
- ((unsigned char)s[0] == 0xF0 &&
- (unsigned char)s[1] < 0x90) ||
- ((unsigned char)s[0] == 0xF4 &&
- (unsigned char)s[1] > 0x8F)) {
- errmsg = "invalid continuation byte";
- startinpos = s-starts;
- endinpos = startinpos + 1;
- if ((s[1] & 0xC0) == 0x80) {
- endinpos++;
- if ((s[2] & 0xC0) == 0x80)
- endinpos++;
- }
- goto utf8Error;
- }
- ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
- ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
- assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
-
-#ifdef Py_UNICODE_WIDE
- *p++ = (Py_UNICODE)ch;
-#else
- /* compute and append the two surrogates: */
-
- /* translate from 10000..10FFFF to 0..FFFF */
- ch -= 0x10000;
-
- /* high surrogate = top 10 bits added to D800 */
- *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
-
- /* low surrogate = bottom 10 bits added to DC00 */
- *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
-#endif
+ errmsg = "invalid continuation byte";
+ startinpos = s - starts;
+ endinpos = startinpos + ch - 1;
break;
+ default:
+ if (unicode_putchar(&unicode, &outpos, ch) < 0)
+ goto onError;
+ continue;
}
- s += n;
- continue;
- utf8Error:
- outpos = p-PyUnicode_AS_UNICODE(unicode);
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
"utf-8", errmsg,
byte order setting accordingly. In native mode, the leading BOM
mark is skipped, in all other modes, it is copied to the output
stream as-is (giving a ZWNBSP character). */
- if (bo == 0) {
- if (size >= 2) {
- const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
- if (bom == 0xFEFF) {
- q += 2;
- bo = -1;
- }
- else if (bom == 0xFFFE) {
- q += 2;
- bo = 1;
- }
-#else
- if (bom == 0xFEFF) {
- q += 2;
- bo = 1;
- }
- else if (bom == 0xFFFE) {
- q += 2;
- bo = -1;
- }
-#endif
+ if (bo == 0 && size >= 2) {
+ const Py_UCS4 bom = (q[1] << 8) | q[0];
+ if (bom == 0xFEFF) {
+ q += 2;
+ bo = -1;
+ }
+ else if (bom == 0xFFFE) {
+ q += 2;
+ bo = 1;
}
+ if (byteorder)
+ *byteorder = bo;
}
- if (bo == -1) {
- /* force LE */
- ihi = 1;
- ilo = 0;
- }
- else if (bo == 1) {
- /* force BE */
- ihi = 0;
- ilo = 1;
+ if (q == e) {
+ if (consumed)
+ *consumed = size;
- Py_INCREF(unicode_empty);
- return unicode_empty;
++ _Py_RETURN_UNICODE_EMPTY();
}
+
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
- native_ordering = ilo < ihi;
+ native_ordering = bo <= 0;
#else
- native_ordering = ilo > ihi;
+ native_ordering = bo >= 0;
#endif
- aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
+ /* Note: size will always be longer than the resulting Unicode
+ character count */
+ unicode = PyUnicode_New((e - q + 1) / 2, 127);
+ if (!unicode)
+ return NULL;
+
+ outpos = 0;
while (1) {
- Py_UNICODE ch;
- if (e - q < 2) {
+ Py_UCS4 ch = 0;
+ if (e - q >= 2) {
+ int kind = PyUnicode_KIND(unicode);
+ if (kind == PyUnicode_1BYTE_KIND) {
+ if (PyUnicode_IS_ASCII(unicode))
+ ch = asciilib_utf16_decode(&q, e,
+ PyUnicode_1BYTE_DATA(unicode), &outpos,
+ native_ordering);
+ else
+ ch = ucs1lib_utf16_decode(&q, e,
+ PyUnicode_1BYTE_DATA(unicode), &outpos,
+ native_ordering);
+ } else if (kind == PyUnicode_2BYTE_KIND) {
+ ch = ucs2lib_utf16_decode(&q, e,
+ PyUnicode_2BYTE_DATA(unicode), &outpos,
+ native_ordering);
+ } else {
+ assert(kind == PyUnicode_4BYTE_KIND);
+ ch = ucs4lib_utf16_decode(&q, e,
+ PyUnicode_4BYTE_DATA(unicode), &outpos,
+ native_ordering);
+ }
+ }
+
+ switch (ch)
+ {
+ case 0:
/* remaining byte at the end? (size should be even) */
if (q == e || consumed)
- break;
+ goto End;
errmsg = "truncated data";
startinpos = ((const char *)q) - starts;
endinpos = ((const char *)e) - starts;
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
- if (size == 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
++ if (size == 0)
++ _Py_RETURN_UNICODE_EMPTY();
+
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
- if (size == 1 && *(unsigned char*)s < 128) {
- Py_UNICODE r = *(unsigned char*)s;
- return PyUnicode_FromUnicode(&r, 1);
- }
+ if (size == 1 && (unsigned char)s[0] < 128)
+ return get_latin1_char((unsigned char)s[0]);
- v = _PyUnicode_New(size);
- if (v == NULL)
+ unicode = PyUnicode_New(size, 127);
+ if (unicode == NULL)
goto onError;
- if (size == 0)
- return (PyObject *)v;
- p = PyUnicode_AS_UNICODE(v);
+
e = s + size;
+ data = PyUnicode_1BYTE_DATA(unicode);
+ outpos = ascii_decode(s, e, (Py_UCS1 *)data);
+ if (outpos == size)
+ return unicode;
+
+ s += outpos;
+ kind = PyUnicode_1BYTE_KIND;
while (s < e) {
register unsigned char c = (unsigned char)*s;
if (c < 128) {
#define NEED_RETRY
#endif
-/* XXX This code is limited to "true" double-byte encodings, as
- a) it assumes an incomplete character consists of a single byte, and
- b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
- encodings, see IsDBCSLeadByteEx documentation. */
+#ifndef WC_ERR_INVALID_CHARS
+# define WC_ERR_INVALID_CHARS 0x0080
+#endif
+
+static char*
+code_page_name(UINT code_page, PyObject **obj)
+{
+ *obj = NULL;
+ if (code_page == CP_ACP)
+ return "mbcs";
+ if (code_page == CP_UTF7)
+ return "CP_UTF7";
+ if (code_page == CP_UTF8)
+ return "CP_UTF8";
+
+ *obj = PyBytes_FromFormat("cp%u", code_page);
+ if (*obj == NULL)
+ return NULL;
+ return PyBytes_AS_STRING(*obj);
+}
+
+static int
+is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
+{
+ const char *curr = s + offset;
+ const char *prev;
+
+ if (!IsDBCSLeadByteEx(code_page, *curr))
+ return 0;
+
+ prev = CharPrevExA(code_page, s, curr, 0);
+ if (prev == curr)
+ return 1;
+ /* FIXME: This code is limited to "true" double-byte encodings,
+ as it assumes an incomplete character consists of a single
+ byte. */
+ if (curr - prev == 2)
+ return 1;
+ if (!IsDBCSLeadByteEx(code_page, *prev))
+ return 1;
+ return 0;
+}
+
+static DWORD
+decode_code_page_flags(UINT code_page)
+{
+ if (code_page == CP_UTF7) {
+ /* The CP_UTF7 decoder only supports flags=0 */
+ return 0;
+ }
+ else
+ return MB_ERR_INVALID_CHARS;
+}
+
+/*
+ * Decode a byte string from a Windows code page into a unicode object in
+ * strict mode.
+ *
+ * Returns the consumed size on success, returns -2 on a decode error, or
+ * raises a WindowsError and returns -1 on any other error.
+ */
+static int
+decode_code_page_strict(UINT code_page,
+ PyObject **v,
+ const char *in,
+ int insize)
+{
+ const DWORD flags = decode_code_page_flags(code_page);
+ wchar_t *out;
+ DWORD outsize;
+
+ /* First get the size of the result */
+ assert(insize > 0);
+ outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
+ if (outsize <= 0)
+ goto error;
+
+ if (*v == NULL) {
+ /* Create unicode object */
+ /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
+ *v = (PyObject*)_PyUnicode_New(outsize);
+ if (*v == NULL)
+ return -1;
+ out = PyUnicode_AS_UNICODE(*v);
+ }
+ else {
+ /* Extend unicode object */
+ Py_ssize_t n = PyUnicode_GET_SIZE(*v);
+ if (unicode_resize(v, n + outsize) < 0)
+ return -1;
+ out = PyUnicode_AS_UNICODE(*v) + n;
+ }
+
+ /* Do the conversion */
+ outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
+ if (outsize <= 0)
+ goto error;
+ return insize;
+
+error:
+ if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
+ return -2;
+ PyErr_SetFromWindowsErr(0);
+ return -1;
+}
+
+/*
+ * Decode a byte string from a code page into a unicode object with an error
+ * handler.
+ *
+ * Returns the consumed size on success, or raises a WindowsError or
+ * UnicodeDecodeError exception and returns -1 on error.
+ */
+static int
+decode_code_page_errors(UINT code_page,
+ PyObject **v,
+ const char *in, const int size,
+ const char *errors)
+{
+ const char *startin = in;
+ const char *endin = in + size;
+ const DWORD flags = decode_code_page_flags(code_page);
+ /* Ideally, we should get reason from FormatMessage. This is the Windows
+ 2000 English version of the message. */
+ const char *reason = "No mapping for the Unicode character exists "
+ "in the target code page.";
+ /* each step cannot decode more than 1 character, but a character can be
+ represented as a surrogate pair */
+ wchar_t buffer[2], *startout, *out;
+ int insize, outsize;
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
+ PyObject *encoding_obj = NULL;
+ char *encoding;
+ DWORD err;
+ int ret = -1;
+
+ assert(size > 0);
+
+ encoding = code_page_name(code_page, &encoding_obj);
+ if (encoding == NULL)
+ return -1;
+
+ if (errors == NULL || strcmp(errors, "strict") == 0) {
+ /* The last error was ERROR_NO_UNICODE_TRANSLATION, so raise a
+ UnicodeDecodeError. */
+ make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
+ if (exc != NULL) {
+ PyCodec_StrictErrors(exc);
+ Py_CLEAR(exc);
+ }
+ goto error;
+ }
+
+ if (*v == NULL) {
+ /* Create unicode object */
+ if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
+ PyErr_NoMemory();
+ goto error;
+ }
+ /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
+ *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
+ if (*v == NULL)
+ goto error;
+ startout = PyUnicode_AS_UNICODE(*v);
+ }
+ else {
+ /* Extend unicode object */
+ Py_ssize_t n = PyUnicode_GET_SIZE(*v);
+ if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
+ PyErr_NoMemory();
+ goto error;
+ }
+ if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
+ goto error;
+ startout = PyUnicode_AS_UNICODE(*v) + n;
+ }
+
+ /* Decode the byte string character per character */
+ out = startout;
+ while (in < endin)
+ {
+ /* Decode a character */
+ insize = 1;
+ do
+ {
+ outsize = MultiByteToWideChar(code_page, flags,
+ in, insize,
+ buffer, Py_ARRAY_LENGTH(buffer));
+ if (outsize > 0)
+ break;
+ err = GetLastError();
+ if (err != ERROR_NO_UNICODE_TRANSLATION
+ && err != ERROR_INSUFFICIENT_BUFFER)
+ {
+ PyErr_SetFromWindowsErr(0);
+ goto error;
+ }
+ insize++;
+ }
+ /* 4=maximum length of a UTF-8 sequence */
+ while (insize <= 4 && (in + insize) <= endin);
+
+ if (outsize <= 0) {
+ Py_ssize_t startinpos, endinpos, outpos;
+
+ startinpos = in - startin;
+ endinpos = startinpos + 1;
+ outpos = out - PyUnicode_AS_UNICODE(*v);
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ encoding, reason,
+ &startin, &endin, &startinpos, &endinpos, &exc, &in,
+ v, &outpos))
+ {
+ goto error;
+ }
+ out = PyUnicode_AS_UNICODE(*v) + outpos;
+ }
+ else {
+ in += insize;
+ memcpy(out, buffer, outsize * sizeof(wchar_t));
+ out += outsize;
+ }
+ }
+
+ /* write a NUL character at the end */
+ *out = 0;
+
+ /* Shrink unicode object to the size actually written */
+ outsize = out - startout;
+ assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
+ if (unicode_resize(v, outsize) < 0)
+ goto error;
+ ret = size;
+
+error:
+ Py_XDECREF(encoding_obj);
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
+ return ret;
+}
+
+static PyObject *
+decode_code_page_stateful(int code_page,
+ const char *s, Py_ssize_t size,
+ const char *errors, Py_ssize_t *consumed)
+{
+ PyObject *v = NULL;
+ int chunk_size, final, converted, done;
+
+ if (code_page < 0) {
+ PyErr_SetString(PyExc_ValueError, "invalid code page number");
+ return NULL;
+ }
+
+ if (consumed)
+ *consumed = 0;
+
+ do
+ {
+#ifdef NEED_RETRY
+ if (size > INT_MAX) {
+ chunk_size = INT_MAX;
+ final = 0;
+ done = 0;
+ }
+ else
+#endif
+ {
+ chunk_size = (int)size;
+ final = (consumed == NULL);
+ done = 1;
+ }
+
+ /* Skip trailing lead-byte unless 'final' is set */
+ if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
+ --chunk_size;
+
+ if (chunk_size == 0 && done) {
+ if (v != NULL)
+ break;
- Py_INCREF(unicode_empty);
- return unicode_empty;
++ _Py_RETURN_UNICODE_EMPTY();
+ }
+
+
+ converted = decode_code_page_strict(code_page, &v,
+ s, chunk_size);
+ if (converted == -2)
+ converted = decode_code_page_errors(code_page, &v,
+ s, chunk_size,
+ errors);
+ assert(converted != 0);
+
+ if (converted < 0) {
+ Py_XDECREF(v);
+ return NULL;
+ }
+
+ if (consumed)
+ *consumed += converted;
+
+ s += converted;
+ size -= converted;
+ } while (!done);
+
+ return unicode_result(v);
+}
+
+PyObject *
+PyUnicode_DecodeCodePageStateful(int code_page,
+ const char *s,
+ Py_ssize_t size,
+ const char *errors,
+ Py_ssize_t *consumed)
+{
+ return decode_code_page_stateful(code_page, s, size, errors, consumed);
+}
+
+PyObject *
+PyUnicode_DecodeMBCSStateful(const char *s,
+ Py_ssize_t size,
+ const char *errors,
+ Py_ssize_t *consumed)
+{
+ return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
+}
-static int is_dbcs_lead_byte(const char *s, int offset)
+PyObject *
+PyUnicode_DecodeMBCS(const char *s,
+ Py_ssize_t size,
+ const char *errors)
{
- const char *curr = s + offset;
+ return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
+}
- if (IsDBCSLeadByte(*curr)) {
- const char *prev = CharPrev(s, curr);
- return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
+static DWORD
+encode_code_page_flags(UINT code_page, const char *errors)
+{
+ if (code_page == CP_UTF8) {
+ if (winver.dwMajorVersion >= 6)
+ /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
+ and later */
+ return WC_ERR_INVALID_CHARS;
+ else
+ /* CP_UTF8 only supports flags=0 on Windows older than Vista */
+ return 0;
+ }
+ else if (code_page == CP_UTF7) {
+ /* CP_UTF7 only supports flags=0 */
+ return 0;
+ }
+ else {
+ if (errors != NULL && strcmp(errors, "replace") == 0)
+ return 0;
+ else
+ return WC_NO_BEST_FIT_CHARS;
}
- return 0;
}
/*
seqlen = PySequence_Fast_GET_SIZE(fseq);
/* If empty sequence, return u"". */
if (seqlen == 0) {
- res = _PyUnicode_New(0); /* empty sequence; return u"" */
- goto Done;
+ Py_DECREF(fseq);
- Py_INCREF(unicode_empty);
- res = unicode_empty;
- return res;
++ _Py_RETURN_UNICODE_EMPTY();
}
- items = PySequence_Fast_ITEMS(fseq);
+
/* If singleton sequence with an exact Unicode, return that. */
+ last_obj = NULL;
+ items = PySequence_Fast_ITEMS(fseq);
if (seqlen == 1) {
- item = items[0];
- if (PyUnicode_CheckExact(item)) {
- Py_INCREF(item);
- res = (PyUnicodeObject *)item;
- goto Done;
+ if (PyUnicode_CheckExact(items[0])) {
+ res = items[0];
+ Py_INCREF(res);
+ Py_DECREF(fseq);
+ return res;
}
+ seplen = 0;
+ maxchar = 0;
}
else {
/* Set up sep and seplen */
if (maxcount < 0)
maxcount = PY_SSIZE_T_MAX;
- else if (maxcount == 0 || self->length == 0)
+ else if (maxcount == 0 || slen == 0)
goto nothing;
- if (str1->length == str2->length) {
- Py_ssize_t i;
+ if (str1 == str2)
+ goto nothing;
+ if (skind < kind1)
+ /* substring too wide to be present */
+ goto nothing;
+
+ maxchar = PyUnicode_MAX_CHAR_VALUE(self);
+ maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
+ /* Replacing str1 with str2 may cause a maxchar reduction in the
+ result string. */
+ mayshrink = (maxchar_str2 < maxchar);
+ maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
+
+ if (len1 == len2) {
/* same length */
- if (str1->length == 0)
+ if (len1 == 0)
goto nothing;
- if (str1->length == 1) {
+ if (len1 == 1) {
/* replace characters */
- Py_UNICODE u1, u2;
- if (!findchar(self->str, self->length, str1->str[0]))
+ Py_UCS4 u1, u2;
+ int rkind;
+ Py_ssize_t index, pos;
+ char *src;
+
+ u1 = PyUnicode_READ_CHAR(str1, 0);
+ pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
+ if (pos < 0)
goto nothing;
- u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
+ u2 = PyUnicode_READ_CHAR(str2, 0);
+ u = PyUnicode_New(slen, maxchar);
if (!u)
- return NULL;
- Py_UNICODE_COPY(u->str, self->str, self->length);
- u1 = str1->str[0];
- u2 = str2->str[0];
- for (i = 0; i < u->length; i++)
- if (u->str[i] == u1) {
- if (--maxcount < 0)
- break;
- u->str[i] = u2;
- }
- } else {
- i = stringlib_find(
- self->str, self->length, str1->str, str1->length, 0
- );
+ goto error;
+ _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
+ rkind = PyUnicode_KIND(u);
+
+ PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
+ index = 0;
+ src = sbuf;
+ while (--maxcount)
+ {
+ pos++;
+ src += pos * PyUnicode_KIND(self);
+ slen -= pos;
+ index += pos;
+ pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
+ if (pos < 0)
+ break;
+ PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
+ }
+ }
+ else {
+ int rkind = skind;
+ char *res;
+ Py_ssize_t i;
+
+ if (kind1 < rkind) {
+ /* widen substring */
+ buf1 = _PyUnicode_AsKind(str1, rkind);
+ if (!buf1) goto error;
+ release1 = 1;
+ }
+ i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
if (i < 0)
goto nothing;
- u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
+ if (rkind > kind2) {
+ /* widen replacement */
+ buf2 = _PyUnicode_AsKind(str2, rkind);
+ if (!buf2) goto error;
+ release2 = 1;
+ }
+ else if (rkind < kind2) {
+ /* widen self and buf1 */
+ rkind = kind2;
+ if (release1) PyMem_Free(buf1);
+ release1 = 0;
+ sbuf = _PyUnicode_AsKind(self, rkind);
+ if (!sbuf) goto error;
+ srelease = 1;
+ buf1 = _PyUnicode_AsKind(str1, rkind);
+ if (!buf1) goto error;
+ release1 = 1;
+ }
+ u = PyUnicode_New(slen, maxchar);
if (!u)
- return NULL;
- Py_UNICODE_COPY(u->str, self->str, self->length);
+ goto error;
+ assert(PyUnicode_KIND(u) == rkind);
+ res = PyUnicode_DATA(u);
+ memcpy(res, sbuf, rkind * slen);
/* change everything in-place, starting with this one */
- Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
- i += str1->length;
+ memcpy(res + rkind * i,
+ buf2,
+ rkind * len2);
+ i += len1;
while ( --maxcount > 0) {
- i = stringlib_find(self->str+i, self->length-i,
- str1->str, str1->length,
- i);
+ i = anylib_find(rkind, self,
+ sbuf+rkind*i, slen-i,
+ str1, buf1, len1, i);
if (i == -1)
break;
- Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
- i += str1->length;
+ memcpy(res + rkind * i,
+ buf2,
+ rkind * len2);
+ i += len1;
}
}
- } else {
-
- Py_ssize_t n, i, j;
- Py_ssize_t product, new_size, delta;
- Py_UNICODE *p;
-
- /* replace strings */
- n = stringlib_count(self->str, self->length, str1->str, str1->length,
- maxcount);
+ }
+ else {
+ Py_ssize_t n, i, j, ires;
+ Py_ssize_t new_size;
+ int rkind = skind;
+ char *res;
+
+ if (kind1 < rkind) {
+ /* widen substring */
+ buf1 = _PyUnicode_AsKind(str1, rkind);
+ if (!buf1) goto error;
+ release1 = 1;
+ }
+ n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
if (n == 0)
goto nothing;
- /* new_size = self->length + n * (str2->length - str1->length)); */
- delta = (str2->length - str1->length);
- if (delta == 0) {
- new_size = self->length;
- } else {
- product = n * (str2->length - str1->length);
- if ((product / (str2->length - str1->length)) != n) {
- PyErr_SetString(PyExc_OverflowError,
- "replace string is too long");
- return NULL;
- }
- new_size = self->length + product;
- if (new_size < 0) {
+ if (kind2 < rkind) {
+ /* widen replacement */
+ buf2 = _PyUnicode_AsKind(str2, rkind);
+ if (!buf2) goto error;
+ release2 = 1;
+ }
+ else if (kind2 > rkind) {
+ /* widen self and buf1 */
+ rkind = kind2;
+ sbuf = _PyUnicode_AsKind(self, rkind);
+ if (!sbuf) goto error;
+ srelease = 1;
+ if (release1) PyMem_Free(buf1);
+ release1 = 0;
+ buf1 = _PyUnicode_AsKind(str1, rkind);
+ if (!buf1) goto error;
+ release1 = 1;
+ }
+ /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
+ PyUnicode_GET_LENGTH(str1))); */
+ if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
PyErr_SetString(PyExc_OverflowError,
"replace string is too long");
- return NULL;
- }
+ goto error;
}
- u = _PyUnicode_New(new_size);
+ new_size = slen + n * (len2 - len1);
+ if (new_size == 0) {
- Py_INCREF(unicode_empty);
++ _Py_INCREF_UNICODE_EMPTY();
++ if (!unicode_empty)
++ goto error;
+ u = unicode_empty;
+ goto done;
+ }
+ if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
+ PyErr_SetString(PyExc_OverflowError,
+ "replace string is too long");
+ goto error;
+ }
+ u = PyUnicode_New(new_size, maxchar);
if (!u)
- return NULL;
- i = 0;
- p = u->str;
- if (str1->length > 0) {
+ goto error;
+ assert(PyUnicode_KIND(u) == rkind);
+ res = PyUnicode_DATA(u);
+ ires = i = 0;
+ if (len1 > 0) {
while (n-- > 0) {
/* look for next match */
- j = stringlib_find(self->str+i, self->length-i,
- str1->str, str1->length,
- i);
+ j = anylib_find(rkind, self,
+ sbuf + rkind * i, slen-i,
+ str1, buf1, len1, i);
if (j == -1)
break;
else if (j > i) {
j++;
}
- if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
- Py_INCREF(self);
- return (PyObject*)self;
- }
- else
- return PyUnicode_FromUnicode(s+i, j-i);
+ return PyUnicode_Substring(self, i, j);
}
- if (start >= length || end < start) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
+PyObject*
+PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
+{
+ unsigned char *data;
+ int kind;
+ Py_ssize_t length;
+
+ if (PyUnicode_READY(self) == -1)
+ return NULL;
+
+ length = PyUnicode_GET_LENGTH(self);
+ end = Py_MIN(end, length);
+
+ if (start == 0 && end == length)
+ return unicode_result_unchanged(self);
+
+ if (start < 0 || end < 0) {
+ PyErr_SetString(PyExc_IndexError, "string index out of range");
+ return NULL;
+ }
++ if (start >= length || end < start)
++ _Py_RETURN_UNICODE_EMPTY();
+
+ length = end - start;
+ if (PyUnicode_IS_ASCII(self)) {
+ data = PyUnicode_1BYTE_DATA(self);
+ return _PyUnicode_FromASCII((char*)(data + start), length);
+ }
+ else {
+ kind = PyUnicode_KIND(self);
+ data = PyUnicode_1BYTE_DATA(self);
+ return PyUnicode_FromKindAndData(kind,
+ data + kind * start,
+ length);
+ }
+}
static PyObject *
-do_strip(PyUnicodeObject *self, int striptype)
+do_strip(PyObject *self, int striptype)
{
- Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
- Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
+ int kind;
+ void *data;
+ Py_ssize_t len, i, j;
+
+ if (PyUnicode_READY(self) == -1)
+ return NULL;
+
+ kind = PyUnicode_KIND(self);
+ data = PyUnicode_DATA(self);
+ len = PyUnicode_GET_LENGTH(self);
i = 0;
if (striptype != RIGHTSTRIP) {
static PyObject*
-unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
+unicode_repeat(PyObject *str, Py_ssize_t len)
{
- PyUnicodeObject *u;
- Py_UNICODE *p;
- Py_ssize_t nchars;
- size_t nbytes;
+ PyObject *u;
+ Py_ssize_t nchars, n;
- if (len < 1) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
+ if (len < 1)
+ _Py_RETURN_UNICODE_EMPTY();
- if (len == 1 && PyUnicode_CheckExact(str)) {
- /* no repeat, return original string */
- Py_INCREF(str);
- return (PyObject*) str;
- }
+ /* no repeat, return original string */
+ if (len == 1)
+ return unicode_result_unchanged(str);
- /* ensure # of chars needed doesn't overflow int and # of bytes
- * needed doesn't overflow size_t
- */
- nchars = len * str->length;
- if (nchars / len != str->length) {
- PyErr_SetString(PyExc_OverflowError,
- "repeated string is too long");
+ if (PyUnicode_READY(str) == -1)
return NULL;
- }
- nbytes = (nchars + 1) * sizeof(Py_UNICODE);
- if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
+
+ if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
PyErr_SetString(PyExc_OverflowError,
"repeated string is too long");
return NULL;
return PyBool_FromLong(result);
}
-#include "stringlib/string_format.h"
+Py_LOCAL_INLINE(void)
+_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
+{
+ writer->size = PyUnicode_GET_LENGTH(writer->buffer);
+ writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
+ writer->data = PyUnicode_DATA(writer->buffer);
+ writer->kind = PyUnicode_KIND(writer->buffer);
+}
+
+void
+_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
+{
+ memset(writer, 0, sizeof(*writer));
+#ifdef Py_DEBUG
+ writer->kind = 5; /* invalid kind */
+#endif
+ writer->min_length = Py_MAX(min_length, 100);
+ writer->overallocate = (min_length > 0);
+}
+
+int
+_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
+ Py_ssize_t length, Py_UCS4 maxchar)
+{
+ Py_ssize_t newlen;
+ PyObject *newbuffer;
+
+ assert(length > 0);
+
+ if (length > PY_SSIZE_T_MAX - writer->pos) {
+ PyErr_NoMemory();
+ return -1;
+ }
+ newlen = writer->pos + length;
+
+ if (writer->buffer == NULL) {
+ if (writer->overallocate) {
+ /* overallocate 25% to limit the number of resizes */
+ if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
+ newlen += newlen / 4;
+ if (newlen < writer->min_length)
+ newlen = writer->min_length;
+ }
+ writer->buffer = PyUnicode_New(newlen, maxchar);
+ if (writer->buffer == NULL)
+ return -1;
+ _PyUnicodeWriter_Update(writer);
+ return 0;
+ }
+
+ if (newlen > writer->size) {
+ if (writer->overallocate) {
+ /* overallocate 25% to limit the number of resizes */
+ if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
+ newlen += newlen / 4;
+ if (newlen < writer->min_length)
+ newlen = writer->min_length;
+ }
+
+ if (maxchar > writer->maxchar || writer->readonly) {
+ /* resize + widen */
+ newbuffer = PyUnicode_New(newlen, maxchar);
+ if (newbuffer == NULL)
+ return -1;
+ _PyUnicode_FastCopyCharacters(newbuffer, 0,
+ writer->buffer, 0, writer->pos);
+ Py_DECREF(writer->buffer);
+ writer->readonly = 0;
+ }
+ else {
+ newbuffer = resize_compact(writer->buffer, newlen);
+ if (newbuffer == NULL)
+ return -1;
+ }
+ writer->buffer = newbuffer;
+ _PyUnicodeWriter_Update(writer);
+ }
+ else if (maxchar > writer->maxchar) {
+ assert(!writer->readonly);
+ newbuffer = PyUnicode_New(writer->size, maxchar);
+ if (newbuffer == NULL)
+ return -1;
+ _PyUnicode_FastCopyCharacters(newbuffer, 0,
+ writer->buffer, 0, writer->pos);
+ Py_DECREF(writer->buffer);
+ writer->buffer = newbuffer;
+ _PyUnicodeWriter_Update(writer);
+ }
+ return 0;
+}
+
+int
+_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
+{
+ Py_UCS4 maxchar;
+ Py_ssize_t len;
+
+ if (PyUnicode_READY(str) == -1)
+ return -1;
+ len = PyUnicode_GET_LENGTH(str);
+ if (len == 0)
+ return 0;
+ maxchar = PyUnicode_MAX_CHAR_VALUE(str);
+ if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
+ if (writer->buffer == NULL && !writer->overallocate) {
+ Py_INCREF(str);
+ writer->buffer = str;
+ _PyUnicodeWriter_Update(writer);
+ writer->readonly = 1;
+ writer->size = 0;
+ writer->pos += len;
+ return 0;
+ }
+ if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
+ return -1;
+ }
+ _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
+ str, 0, len);
+ writer->pos += len;
+ return 0;
+}
+
+PyObject *
+_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
+{
+ if (writer->pos == 0) {
+ Py_XDECREF(writer->buffer);
- Py_INCREF(unicode_empty);
- return unicode_empty;
++ _Py_RETURN_UNICODE_EMPTY();
+ }
+ if (writer->readonly) {
+ assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
+ return writer->buffer;
+ }
+ if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
+ PyObject *newbuffer;
+ newbuffer = resize_compact(writer->buffer, writer->pos);
+ if (newbuffer == NULL) {
+ Py_DECREF(writer->buffer);
+ return NULL;
+ }
+ writer->buffer = newbuffer;
+ }
+ assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
+ return writer->buffer;
+}
+
+void
+_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
+{
+ Py_CLEAR(writer->buffer);
+}
+
+#include "stringlib/unicode_format.h"
PyDoc_STRVAR(format__doc__,
"S.format(*args, **kwargs) -> str\n\
}
if (slicelength <= 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- return PyUnicode_FromUnicode(NULL, 0);
- } else if (start == 0 && step == 1 && slicelength == self->length &&
- PyUnicode_CheckExact(self)) {
- Py_INCREF(self);
- return (PyObject *)self;
++ _Py_RETURN_UNICODE_EMPTY();
+ } else if (start == 0 && step == 1 &&
+ slicelength == PyUnicode_GET_LENGTH(self)) {
+ return unicode_result_unchanged(self);
} else if (step == 1) {
- return PyUnicode_FromUnicode(self->str + start, slicelength);
- } else {
- source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
- result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
- sizeof(Py_UNICODE));
-
- if (result_buf == NULL)
- return PyErr_NoMemory();
-
+ return PyUnicode_Substring(self,
+ start, start + slicelength);
+ }
+ /* General case */
+ src_kind = PyUnicode_KIND(self);
+ src_data = PyUnicode_DATA(self);
+ if (!PyUnicode_IS_ASCII(self)) {
+ kind_limit = kind_maxchar_limit(src_kind);
+ max_char = 0;
for (cur = start, i = 0; i < slicelength; cur += step, i++) {
- result_buf[i] = source_buf[cur];
+ ch = PyUnicode_READ(src_kind, src_data, cur);
+ if (ch > max_char) {
+ max_char = ch;
+ if (max_char >= kind_limit)
+ break;
+ }
}
+ }
+ else
+ max_char = 127;
+ result = PyUnicode_New(slicelength, max_char);
+ if (result == NULL)
+ return NULL;
+ dest_kind = PyUnicode_KIND(result);
+ dest_data = PyUnicode_DATA(result);
- result = PyUnicode_FromUnicode(result_buf, slicelength);
- PyObject_FREE(result_buf);
- return result;
+ for (cur = start, i = 0; i < slicelength; cur += step, i++) {
+ Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
+ PyUnicode_WRITE(dest_kind, dest_data, i, ch);
}
+ assert(_PyUnicode_CheckConsistency(result, 1));
+ return result;
} else {
PyErr_SetString(PyExc_TypeError, "string indices must be integers");
return NULL;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
kwlist, &x, &encoding, &errors))
return NULL;
- if (x == NULL) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
+ if (x == NULL)
- return (PyObject *)_PyUnicode_New(0);
++ _Py_RETURN_UNICODE_EMPTY();
if (encoding == NULL && errors == NULL)
return PyObject_Str(x);
else
/* Initialize the Unicode implementation */
-void _PyUnicode_Init(void)
+int _PyUnicode_Init(void)
{
- int i;
-
/* XXX - move this array to unicodectype.c ? */
- Py_UNICODE linebreak[] = {
+ Py_UCS2 linebreak[] = {
0x000A, /* LINE FEED */
0x000D, /* CARRIAGE RETURN */
0x001C, /* FILE SEPARATOR */
};
/* Init the implementation */
- unicode_empty = PyUnicode_New(0, 0);
- if (!unicode_empty) {
- unicode_empty = _PyUnicode_New(0);
- if (!unicode_empty)
- return;
- }
++ _Py_INCREF_UNICODE_EMPTY();
+ if (!unicode_empty)
+ Py_FatalError("Can't create empty string");
- assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
++ Py_DECREF(unicode_empty);
- for (i = 0; i < 256; i++)
- unicode_latin1[i] = NULL;
if (PyType_Ready(&PyUnicode_Type) < 0)
Py_FatalError("Can't initialize 'unicode'");
{
int i;
- Py_XDECREF(unicode_empty);
- unicode_empty = NULL;
+ Py_CLEAR(unicode_empty);
- for (i = 0; i < 256; i++) {
- if (unicode_latin1[i]) {
- Py_DECREF(unicode_latin1[i]);
- unicode_latin1[i] = NULL;
- }
- }
+ for (i = 0; i < 256; i++)
+ Py_CLEAR(unicode_latin1[i]);
-
+ _PyUnicode_ClearStaticStrings();
(void)PyUnicode_ClearFreeList();
}