Issue #10156: In the interpreter's initialization phase, unicode globals are now initialized dynamically as needed.

author Serhiy Storchaka <storchaka@gmail.com>

Sat, 26 Jan 2013 10:16:36 +0000 (12:16 +0200)

committer Serhiy Storchaka <storchaka@gmail.com>

Sat, 26 Jan 2013 10:16:36 +0000 (12:16 +0200)
author Serhiy Storchaka <storchaka@gmail.com>
Sat, 26 Jan 2013 10:16:36 +0000 (12:16 +0200)
committer Serhiy Storchaka <storchaka@gmail.com>
Sat, 26 Jan 2013 10:16:36 +0000 (12:16 +0200)
diff --cc Misc/NEWS

index 9d12d88e9840699c215c58eb5d2f88b534300291,c216cc9440068b51a1e592d0c7c17a9abc44a04f..108857bdfb407b82048ef92f993f378cdf73c9e7
--- 1/Misc/NEWS
--- 2/Misc/NEWS
+++ b/Misc/NEWS
@@@ -12,9 -10,9 +12,12 @@@ What's New in Python 3.3.1
   Core and Builtins
   -----------------
   
+ - Issue #10156: In the interpreter's initialization phase, unicode globals
+   are now initialized dynamically as needed.
+ 
+ +- Issue #16980: Fix processing of escaped non-ascii bytes in the
+ +  unicode-escape-decode decoder.
+ +
   - Issue #16975: Fix error handling bug in the escape-decode bytes decoder.
   
   - Issue #14850: Now a charmap decoder treats U+FFFE as "undefined mapping"
diff --cc Objects/unicodeobject.c

index a2ddf3e578f8f4e4b58d213dfb5ef8a5b906acef,92d17771e2adc1a7d66341a73de20a068f70d7cb..c96a91c3973218b3a30000e31e714eb725b1f1bb
--- 1/Objects/unicodeobject.c
--- 2/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@@ -179,17 -99,30 +180,36 @@@ extern "C" 
      Another way to look at this is that to say that the actual reference
      count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
   */
- static PyObject *interned;
+ static PyObject *interned = NULL;
   
- -/* Free list for Unicode objects */
- -static PyUnicodeObject *free_list = NULL;
- -static int numfree = 0;
- -
   /* The empty Unicode object is shared to improve performance. */
- static PyObject *unicode_empty;
- -static PyUnicodeObject *unicode_empty = NULL;
++static PyObject *unicode_empty = NULL;
+ 
- -#define _Py_RETURN_UNICODE_EMPTY()                      \
++#define _Py_INCREF_UNICODE_EMPTY()                      \
+     do {                                                \
+         if (unicode_empty != NULL)                      \
+             Py_INCREF(unicode_empty);                   \
+         else {                                          \
- -            unicode_empty = _PyUnicode_New(0);          \
- -            if (unicode_empty != NULL)                  \
++            unicode_empty = PyUnicode_New(0, 0);        \
++            if (unicode_empty != NULL) {                \
+                 Py_INCREF(unicode_empty);               \
++                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
++            }                                           \
+         }                                               \
- -        return (PyObject *)unicode_empty;               \
+     } while (0)
+ 
++#define _Py_RETURN_UNICODE_EMPTY()                      \
++    do {                                                \
++        _Py_INCREF_UNICODE_EMPTY();                     \
++        return unicode_empty;                           \
++    } while (0)
+ +
+ +/* List of static strings. */
- static _Py_Identifier *static_strings;
++static _Py_Identifier *static_strings = NULL;
+ +
   /* Single character Unicode strings in the Latin-1 range are being
      shared as well. */
- static PyObject *unicode_latin1[256];
- -static PyUnicodeObject *unicode_latin1[256] = {NULL};
++static PyObject *unicode_latin1[256] = {NULL};
   
   /* Fast detection of the most frequent whitespace characters */
   const unsigned char _Py_ascii_whitespace[] = {
@@@ -290,224 -207,6 +310,223 @@@ PyUnicode_GetMax(void
   #endif
   }
   
-         Py_INCREF(unicode_empty);
+ +#ifdef Py_DEBUG
+ +int
+ +_PyUnicode_CheckConsistency(PyObject *op, int check_content)
+ +{
+ +    PyASCIIObject *ascii;
+ +    unsigned int kind;
+ +
+ +    assert(PyUnicode_Check(op));
+ +
+ +    ascii = (PyASCIIObject *)op;
+ +    kind = ascii->state.kind;
+ +
+ +    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
+ +        assert(kind == PyUnicode_1BYTE_KIND);
+ +        assert(ascii->state.ready == 1);
+ +    }
+ +    else {
+ +        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
+ +        void *data;
+ +
+ +        if (ascii->state.compact == 1) {
+ +            data = compact + 1;
+ +            assert(kind == PyUnicode_1BYTE_KIND
+ +                   || kind == PyUnicode_2BYTE_KIND
+ +                   || kind == PyUnicode_4BYTE_KIND);
+ +            assert(ascii->state.ascii == 0);
+ +            assert(ascii->state.ready == 1);
+ +            assert (compact->utf8 != data);
+ +        }
+ +        else {
+ +            PyUnicodeObject *unicode = (PyUnicodeObject *)op;
+ +
+ +            data = unicode->data.any;
+ +            if (kind == PyUnicode_WCHAR_KIND) {
+ +                assert(ascii->length == 0);
+ +                assert(ascii->hash == -1);
+ +                assert(ascii->state.compact == 0);
+ +                assert(ascii->state.ascii == 0);
+ +                assert(ascii->state.ready == 0);
+ +                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
+ +                assert(ascii->wstr != NULL);
+ +                assert(data == NULL);
+ +                assert(compact->utf8 == NULL);
+ +            }
+ +            else {
+ +                assert(kind == PyUnicode_1BYTE_KIND
+ +                       || kind == PyUnicode_2BYTE_KIND
+ +                       || kind == PyUnicode_4BYTE_KIND);
+ +                assert(ascii->state.compact == 0);
+ +                assert(ascii->state.ready == 1);
+ +                assert(data != NULL);
+ +                if (ascii->state.ascii) {
+ +                    assert (compact->utf8 == data);
+ +                    assert (compact->utf8_length == ascii->length);
+ +                }
+ +                else
+ +                    assert (compact->utf8 != data);
+ +            }
+ +        }
+ +        if (kind != PyUnicode_WCHAR_KIND) {
+ +            if (
+ +#if SIZEOF_WCHAR_T == 2
+ +                kind == PyUnicode_2BYTE_KIND
+ +#else
+ +                kind == PyUnicode_4BYTE_KIND
+ +#endif
+ +               )
+ +            {
+ +                assert(ascii->wstr == data);
+ +                assert(compact->wstr_length == ascii->length);
+ +            } else
+ +                assert(ascii->wstr != data);
+ +        }
+ +
+ +        if (compact->utf8 == NULL)
+ +            assert(compact->utf8_length == 0);
+ +        if (ascii->wstr == NULL)
+ +            assert(compact->wstr_length == 0);
+ +    }
+ +    /* check that the best kind is used */
+ +    if (check_content && kind != PyUnicode_WCHAR_KIND)
+ +    {
+ +        Py_ssize_t i;
+ +        Py_UCS4 maxchar = 0;
+ +        void *data;
+ +        Py_UCS4 ch;
+ +
+ +        data = PyUnicode_DATA(ascii);
+ +        for (i=0; i < ascii->length; i++)
+ +        {
+ +            ch = PyUnicode_READ(kind, data, i);
+ +            if (ch > maxchar)
+ +                maxchar = ch;
+ +        }
+ +        if (kind == PyUnicode_1BYTE_KIND) {
+ +            if (ascii->state.ascii == 0) {
+ +                assert(maxchar >= 128);
+ +                assert(maxchar <= 255);
+ +            }
+ +            else
+ +                assert(maxchar < 128);
+ +        }
+ +        else if (kind == PyUnicode_2BYTE_KIND) {
+ +            assert(maxchar >= 0x100);
+ +            assert(maxchar <= 0xFFFF);
+ +        }
+ +        else {
+ +            assert(maxchar >= 0x10000);
+ +            assert(maxchar <= MAX_UNICODE);
+ +        }
+ +        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
+ +    }
+ +    return 1;
+ +}
+ +#endif
+ +
+ +static PyObject*
+ +unicode_result_wchar(PyObject *unicode)
+ +{
+ +#ifndef Py_DEBUG
+ +    Py_ssize_t len;
+ +
+ +    assert(Py_REFCNT(unicode) == 1);
+ +
+ +    len = _PyUnicode_WSTR_LENGTH(unicode);
+ +    if (len == 0) {
-         return unicode_empty;
+ +        Py_DECREF(unicode);
-             Py_INCREF(unicode_empty);
++        _Py_RETURN_UNICODE_EMPTY();
+ +    }
+ +
+ +    if (len == 1) {
+ +        wchar_t ch = _PyUnicode_WSTR(unicode)[0];
+ +        if (ch < 256) {
+ +            PyObject *latin1_char = get_latin1_char((unsigned char)ch);
+ +            Py_DECREF(unicode);
+ +            return latin1_char;
+ +        }
+ +    }
+ +
+ +    if (_PyUnicode_Ready(unicode) < 0) {
+ +        Py_XDECREF(unicode);
+ +        return NULL;
+ +    }
+ +#else
+ +    /* don't make the result ready in debug mode to ensure that the caller
+ +       makes the string ready before using it */
+ +    assert(_PyUnicode_CheckConsistency(unicode, 1));
+ +#endif
+ +    return unicode;
+ +}
+ +
+ +static PyObject*
+ +unicode_result_ready(PyObject *unicode)
+ +{
+ +    Py_ssize_t length;
+ +
+ +    length = PyUnicode_GET_LENGTH(unicode);
+ +    if (length == 0) {
+ +        if (unicode != unicode_empty) {
+ +            Py_DECREF(unicode);
++            _Py_RETURN_UNICODE_EMPTY();
+ +        }
+ +        return unicode_empty;
+ +    }
+ +
+ +    if (length == 1) {
+ +        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
+ +        if (ch < 256) {
+ +            PyObject *latin1_char = unicode_latin1[ch];
+ +            if (latin1_char != NULL) {
+ +                if (unicode != latin1_char) {
+ +                    Py_INCREF(latin1_char);
+ +                    Py_DECREF(unicode);
+ +                }
+ +                return latin1_char;
+ +            }
+ +            else {
+ +                assert(_PyUnicode_CheckConsistency(unicode, 1));
+ +                Py_INCREF(unicode);
+ +                unicode_latin1[ch] = unicode;
+ +                return unicode;
+ +            }
+ +        }
+ +    }
+ +
+ +    assert(_PyUnicode_CheckConsistency(unicode, 1));
+ +    return unicode;
+ +}
+ +
+ +static PyObject*
+ +unicode_result(PyObject *unicode)
+ +{
+ +    assert(_PyUnicode_CHECK(unicode));
+ +    if (PyUnicode_IS_READY(unicode))
+ +        return unicode_result_ready(unicode);
+ +    else
+ +        return unicode_result_wchar(unicode);
+ +}
+ +
+ +static PyObject*
+ +unicode_result_unchanged(PyObject *unicode)
+ +{
+ +    if (PyUnicode_CheckExact(unicode)) {
+ +        if (PyUnicode_READY(unicode) == -1)
+ +            return NULL;
+ +        Py_INCREF(unicode);
+ +        return unicode;
+ +    }
+ +    else
+ +        /* Subtype -- return genuine unicode string with the same value. */
+ +        return _PyUnicode_Copy(unicode);
+ +}
+ +
+ +#ifdef HAVE_MBCS
+ +static OSVERSIONINFOEX winver;
+ +#endif
+ +
   /* --- Bloom Filters ----------------------------------------------------- */
   
   /* stuff to implement simple "bloom filters" for Unicode characters.
@@@ -1515,103 -418,36 +1534,105 @@@ unicode_dealloc(register PyObject *unic
           Py_FatalError("Inconsistent interned string state.");
       }
   
- -    if (PyUnicode_CheckExact(unicode) &&
- -        numfree < PyUnicode_MAXFREELIST) {
- -        /* Keep-Alive optimization */
- -        if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
- -            PyObject_DEL(unicode->str);
- -            unicode->str = NULL;
- -            unicode->length = 0;
- -        }
- -        if (unicode->defenc) {
- -            Py_CLEAR(unicode->defenc);
- -        }
- -        /* Add to free list */
- -        *(PyUnicodeObject **)unicode = free_list;
- -        free_list = unicode;
- -        numfree++;
- -    }
- -    else {
- -        PyObject_DEL(unicode->str);
- -        Py_XDECREF(unicode->defenc);
- -        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
+ +    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
+ +        PyObject_DEL(_PyUnicode_WSTR(unicode));
+ +    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
+ +        PyObject_DEL(_PyUnicode_UTF8(unicode));
+ +    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
+ +        PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
+ +
+ +    Py_TYPE(unicode)->tp_free(unicode);
+ +}
+ +
+ +#ifdef Py_DEBUG
+ +static int
+ +unicode_is_singleton(PyObject *unicode)
+ +{
+ +    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
+ +    if (unicode == unicode_empty)
+ +        return 1;
+ +    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
+ +    {
+ +        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
+ +        if (ch < 256 && unicode_latin1[ch] == unicode)
+ +            return 1;
       }
+ +    return 0;
   }
+ +#endif
   
- -static
- -int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
+ +static int
+ +unicode_modifiable(PyObject *unicode)
   {
- -    register PyUnicodeObject *v;
+ +    assert(_PyUnicode_CHECK(unicode));
+ +    if (Py_REFCNT(unicode) != 1)
+ +        return 0;
+ +    if (_PyUnicode_HASH(unicode) != -1)
+ +        return 0;
+ +    if (PyUnicode_CHECK_INTERNED(unicode))
+ +        return 0;
+ +    if (!PyUnicode_CheckExact(unicode))
+ +        return 0;
+ +#ifdef Py_DEBUG
+ +    /* singleton refcount is greater than 1 */
+ +    assert(!unicode_is_singleton(unicode));
+ +#endif
+ +    return 1;
+ +}
   
- -    /* Argument checks */
- -    if (unicode == NULL) {
+ +static int
+ +unicode_resize(PyObject **p_unicode, Py_ssize_t length)
+ +{
+ +    PyObject *unicode;
+ +    Py_ssize_t old_length;
+ +
+ +    assert(p_unicode != NULL);
+ +    unicode = *p_unicode;
+ +
+ +    assert(unicode != NULL);
+ +    assert(PyUnicode_Check(unicode));
+ +    assert(0 <= length);
+ +
+ +    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
+ +        old_length = PyUnicode_WSTR_LENGTH(unicode);
+ +    else
+ +        old_length = PyUnicode_GET_LENGTH(unicode);
+ +    if (old_length == length)
+ +        return 0;
+ +
+ +    if (length == 0) {
++        _Py_INCREF_UNICODE_EMPTY();
++        if (!unicode_empty)
++            return -1;
+ +        Py_DECREF(*p_unicode);
+ +        *p_unicode = unicode_empty;
-         Py_INCREF(*p_unicode);
+ +        return 0;
+ +    }
+ +
+ +    if (!unicode_modifiable(unicode)) {
+ +        PyObject *copy = resize_copy(unicode, length);
+ +        if (copy == NULL)
+ +            return -1;
+ +        Py_DECREF(*p_unicode);
+ +        *p_unicode = copy;
+ +        return 0;
+ +    }
+ +
+ +    if (PyUnicode_IS_COMPACT(unicode)) {
+ +        PyObject *new_unicode = resize_compact(unicode, length);
+ +        if (new_unicode == NULL)
+ +            return -1;
+ +        *p_unicode = new_unicode;
+ +        return 0;
+ +    }
+ +    return resize_inplace(unicode, length);
+ +}
+ +
+ +int
+ +PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
+ +{
+ +    PyObject *unicode;
+ +    if (p_unicode == NULL) {
           PyErr_BadInternalCall();
           return -1;
       }
@@@ -1729,530 -489,165 +1750,520 @@@ PyUnicode_FromUnicode(const Py_UNICODE 
   
       /* If the Unicode data is known at construction time, we can apply
          some optimizations which share commonly used objects. */
- -    if (u != NULL) {
   
- -        /* Optimization for empty strings */
- -        if (size == 0)
- -            _Py_RETURN_UNICODE_EMPTY();
+ +    /* Optimization for empty strings */
-     if (size == 0 && unicode_empty != NULL) {
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
-     }
++    if (size == 0)
++        _Py_RETURN_UNICODE_EMPTY();
   
- -        /* Single character Unicode objects in the Latin-1 range are
- -           shared when using this constructor */
- -        if (size == 1 && *u < 256) {
- -            unicode = unicode_latin1[*u];
- -            if (!unicode) {
- -                unicode = _PyUnicode_New(1);
- -                if (!unicode)
- -                    return NULL;
- -                unicode->str[0] = *u;
- -                unicode_latin1[*u] = unicode;
- -            }
- -            Py_INCREF(unicode);
- -            return (PyObject *)unicode;
- -        }
- -    }
+ +    /* Single character Unicode objects in the Latin-1 range are
+ +       shared when using this constructor */
+ +    if (size == 1 && *u < 256)
+ +        return get_latin1_char((unsigned char)*u);
+ +
+ +    /* If not empty and not single character, copy the Unicode data
+ +       into the new object */
+ +    if (find_maxchar_surrogates(u, u + size,
+ +                                &maxchar, &num_surrogates) == -1)
+ +        return NULL;
   
- -    unicode = _PyUnicode_New(size);
+ +    unicode = PyUnicode_New(size - num_surrogates, maxchar);
       if (!unicode)
           return NULL;
   
- -    /* Copy the Unicode data into the new object */
+ +    switch (PyUnicode_KIND(unicode)) {
+ +    case PyUnicode_1BYTE_KIND:
+ +        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
+ +                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
+ +        break;
+ +    case PyUnicode_2BYTE_KIND:
+ +#if Py_UNICODE_SIZE == 2
+ +        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
+ +#else
+ +        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
+ +                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
+ +#endif
+ +        break;
+ +    case PyUnicode_4BYTE_KIND:
+ +#if SIZEOF_WCHAR_T == 2
+ +        /* This is the only case which has to process surrogates, thus
+ +           a simple copy loop is not enough and we need a function. */
+ +        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
+ +#else
+ +        assert(num_surrogates == 0);
+ +        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
+ +#endif
+ +        break;
+ +    default:
+ +        assert(0 && "Impossible state");
+ +    }
+ +
+ +    return unicode_result(unicode);
+ +}
+ +
+ +PyObject *
+ +PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
+ +{
+ +    if (size < 0) {
+ +        PyErr_SetString(PyExc_SystemError,
+ +                        "Negative size passed to PyUnicode_FromStringAndSize");
+ +        return NULL;
+ +    }
       if (u != NULL)
- -        Py_UNICODE_COPY(unicode->str, u, size);
+ +        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
+ +    else
+ +        return (PyObject *)_PyUnicode_New(size);
+ +}
   
- -    return (PyObject *)unicode;
+ +PyObject *
+ +PyUnicode_FromString(const char *u)
+ +{
+ +    size_t size = strlen(u);
+ +    if (size > PY_SSIZE_T_MAX) {
+ +        PyErr_SetString(PyExc_OverflowError, "input too long");
+ +        return NULL;
+ +    }
+ +    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
+ +}
+ +
+ +PyObject *
+ +_PyUnicode_FromId(_Py_Identifier *id)
+ +{
+ +    if (!id->object) {
+ +        id->object = PyUnicode_DecodeUTF8Stateful(id->string,
+ +                                                  strlen(id->string),
+ +                                                  NULL, NULL);
+ +        if (!id->object)
+ +            return NULL;
+ +        PyUnicode_InternInPlace(&id->object);
+ +        assert(!id->next);
+ +        id->next = static_strings;
+ +        static_strings = id;
+ +    }
+ +    return id->object;
+ +}
+ +
+ +void
+ +_PyUnicode_ClearStaticStrings()
+ +{
+ +    _Py_Identifier *tmp, *s = static_strings;
+ +    while (s) {
+ +        Py_DECREF(s->object);
+ +        s->object = NULL;
+ +        tmp = s->next;
+ +        s->next = NULL;
+ +        s = tmp;
+ +    }
+ +    static_strings = NULL;
+ +}
+ +
+ +/* Internal function, doesn't check maximum character */
+ +
+ +PyObject*
+ +_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
+ +{
+ +    const unsigned char *s = (const unsigned char *)buffer;
+ +    PyObject *unicode;
+ +    if (size == 1) {
+ +#ifdef Py_DEBUG
+ +        assert(s[0] < 128);
+ +#endif
+ +        return get_latin1_char(s[0]);
+ +    }
+ +    unicode = PyUnicode_New(size, 127);
+ +    if (!unicode)
+ +        return NULL;
+ +    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
+ +    assert(_PyUnicode_CheckConsistency(unicode, 1));
+ +    return unicode;
+ +}
+ +
+ +static Py_UCS4
+ +kind_maxchar_limit(unsigned int kind)
+ +{
+ +    switch (kind) {
+ +    case PyUnicode_1BYTE_KIND:
+ +        return 0x80;
+ +    case PyUnicode_2BYTE_KIND:
+ +        return 0x100;
+ +    case PyUnicode_4BYTE_KIND:
+ +        return 0x10000;
+ +    default:
+ +        assert(0 && "invalid kind");
+ +        return MAX_UNICODE;
+ +    }
+ +}
+ +
+ +Py_LOCAL_INLINE(Py_UCS4)
+ +align_maxchar(Py_UCS4 maxchar)
+ +{
+ +    if (maxchar <= 127)
+ +        return 127;
+ +    else if (maxchar <= 255)
+ +        return 255;
+ +    else if (maxchar <= 65535)
+ +        return 65535;
+ +    else
+ +        return MAX_UNICODE;
+ +}
+ +
+ +static PyObject*
+ +_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
+ +{
+ +    PyObject *res;
+ +    unsigned char max_char;
+ +
-     if (size == 0) {
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
-     }
++    if (size == 0)
++        _Py_RETURN_UNICODE_EMPTY();
+ +    assert(size > 0);
+ +    if (size == 1)
+ +        return get_latin1_char(u[0]);
+ +
+ +    max_char = ucs1lib_find_max_char(u, u + size);
+ +    res = PyUnicode_New(size, max_char);
+ +    if (!res)
+ +        return NULL;
+ +    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
+ +    assert(_PyUnicode_CheckConsistency(res, 1));
+ +    return res;
+ +}
+ +
+ +static PyObject*
+ +_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
+ +{
+ +    PyObject *res;
+ +    Py_UCS2 max_char;
+ +
-     if (size == 0) {
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
-     }
++    if (size == 0)
++        _Py_RETURN_UNICODE_EMPTY();
+ +    assert(size > 0);
+ +    if (size == 1) {
+ +        Py_UCS4 ch = u[0];
+ +        if (ch < 256)
+ +            return get_latin1_char((unsigned char)ch);
+ +
+ +        res = PyUnicode_New(1, ch);
+ +        if (res == NULL)
+ +            return NULL;
+ +        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
+ +        assert(_PyUnicode_CheckConsistency(res, 1));
+ +        return res;
+ +    }
+ +
+ +    max_char = ucs2lib_find_max_char(u, u + size);
+ +    res = PyUnicode_New(size, max_char);
+ +    if (!res)
+ +        return NULL;
+ +    if (max_char >= 256)
+ +        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
+ +    else {
+ +        _PyUnicode_CONVERT_BYTES(
+ +            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
+ +    }
+ +    assert(_PyUnicode_CheckConsistency(res, 1));
+ +    return res;
+ +}
+ +
+ +static PyObject*
+ +_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
+ +{
+ +    PyObject *res;
+ +    Py_UCS4 max_char;
+ +
-     if (size == 0) {
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
-     }
++    if (size == 0)
++        _Py_RETURN_UNICODE_EMPTY();
+ +    assert(size > 0);
+ +    if (size == 1) {
+ +        Py_UCS4 ch = u[0];
+ +        if (ch < 256)
+ +            return get_latin1_char((unsigned char)ch);
+ +
+ +        res = PyUnicode_New(1, ch);
+ +        if (res == NULL)
+ +            return NULL;
+ +        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
+ +        assert(_PyUnicode_CheckConsistency(res, 1));
+ +        return res;
+ +    }
+ +
+ +    max_char = ucs4lib_find_max_char(u, u + size);
+ +    res = PyUnicode_New(size, max_char);
+ +    if (!res)
+ +        return NULL;
+ +    if (max_char < 256)
+ +        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
+ +                                 PyUnicode_1BYTE_DATA(res));
+ +    else if (max_char < 0x10000)
+ +        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
+ +                                 PyUnicode_2BYTE_DATA(res));
+ +    else
+ +        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
+ +    assert(_PyUnicode_CheckConsistency(res, 1));
+ +    return res;
   }
   
- -PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
+ +PyObject*
+ +PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
   {
- -    PyUnicodeObject *unicode;
- -
       if (size < 0) {
- -        PyErr_SetString(PyExc_SystemError,
- -                        "Negative size passed to PyUnicode_FromStringAndSize");
+ +        PyErr_SetString(PyExc_ValueError, "size must be positive");
+ +        return NULL;
+ +    }
+ +    switch (kind) {
+ +    case PyUnicode_1BYTE_KIND:
+ +        return _PyUnicode_FromUCS1(buffer, size);
+ +    case PyUnicode_2BYTE_KIND:
+ +        return _PyUnicode_FromUCS2(buffer, size);
+ +    case PyUnicode_4BYTE_KIND:
+ +        return _PyUnicode_FromUCS4(buffer, size);
+ +    default:
+ +        PyErr_SetString(PyExc_SystemError, "invalid kind");
           return NULL;
       }
+ +}
   
- -    /* If the Unicode data is known at construction time, we can apply
- -       some optimizations which share commonly used objects.
- -       Also, this means the input must be UTF-8, so fall back to the
- -       UTF-8 decoder at the end. */
- -    if (u != NULL) {
+ +Py_UCS4
+ +_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
+ +{
+ +    enum PyUnicode_Kind kind;
+ +    void *startptr, *endptr;
   
- -        /* Optimization for empty strings */
- -        if (size == 0)
- -            _Py_RETURN_UNICODE_EMPTY();
+ +    assert(PyUnicode_IS_READY(unicode));
+ +    assert(0 <= start);
+ +    assert(end <= PyUnicode_GET_LENGTH(unicode));
+ +    assert(start <= end);
   
- -        /* Single characters are shared when using this constructor.
- -           Restrict to ASCII, since the input must be UTF-8. */
- -        if (size == 1 && Py_CHARMASK(*u) < 128) {
- -            unicode = unicode_latin1[Py_CHARMASK(*u)];
- -            if (!unicode) {
- -                unicode = _PyUnicode_New(1);
- -                if (!unicode)
- -                    return NULL;
- -                unicode->str[0] = Py_CHARMASK(*u);
- -                unicode_latin1[Py_CHARMASK(*u)] = unicode;
- -            }
- -            Py_INCREF(unicode);
- -            return (PyObject *)unicode;
- -        }
+ +    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
+ +        return PyUnicode_MAX_CHAR_VALUE(unicode);
   
- -        return PyUnicode_DecodeUTF8(u, size, NULL);
+ +    if (start == end)
+ +        return 127;
+ +
+ +    if (PyUnicode_IS_ASCII(unicode))
+ +        return 127;
+ +
+ +    kind = PyUnicode_KIND(unicode);
+ +    startptr = PyUnicode_DATA(unicode);
+ +    endptr = (char *)startptr + end * kind;
+ +    startptr = (char *)startptr + start * kind;
+ +    switch(kind) {
+ +    case PyUnicode_1BYTE_KIND:
+ +        return ucs1lib_find_max_char(startptr, endptr);
+ +    case PyUnicode_2BYTE_KIND:
+ +        return ucs2lib_find_max_char(startptr, endptr);
+ +    case PyUnicode_4BYTE_KIND:
+ +        return ucs4lib_find_max_char(startptr, endptr);
+ +    default:
+ +        assert(0);
+ +        return 0;
       }
+ +}
   
- -    unicode = _PyUnicode_New(size);
- -    if (!unicode)
- -        return NULL;
+ +/* Ensure that a string uses the most efficient storage, if it is not the
+ +   case: create a new string with of the right kind. Write NULL into *p_unicode
+ +   on error. */
+ +static void
+ +unicode_adjust_maxchar(PyObject **p_unicode)
+ +{
+ +    PyObject *unicode, *copy;
+ +    Py_UCS4 max_char;
+ +    Py_ssize_t len;
+ +    unsigned int kind;
+ +
+ +    assert(p_unicode != NULL);
+ +    unicode = *p_unicode;
+ +    assert(PyUnicode_IS_READY(unicode));
+ +    if (PyUnicode_IS_ASCII(unicode))
+ +        return;
   
- -    return (PyObject *)unicode;
+ +    len = PyUnicode_GET_LENGTH(unicode);
+ +    kind = PyUnicode_KIND(unicode);
+ +    if (kind == PyUnicode_1BYTE_KIND) {
+ +        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
+ +        max_char = ucs1lib_find_max_char(u, u + len);
+ +        if (max_char >= 128)
+ +            return;
+ +    }
+ +    else if (kind == PyUnicode_2BYTE_KIND) {
+ +        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
+ +        max_char = ucs2lib_find_max_char(u, u + len);
+ +        if (max_char >= 256)
+ +            return;
+ +    }
+ +    else {
+ +        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
+ +        assert(kind == PyUnicode_4BYTE_KIND);
+ +        max_char = ucs4lib_find_max_char(u, u + len);
+ +        if (max_char >= 0x10000)
+ +            return;
+ +    }
+ +    copy = PyUnicode_New(len, max_char);
+ +    if (copy != NULL)
+ +        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
+ +    Py_DECREF(unicode);
+ +    *p_unicode = copy;
   }
   
- -PyObject *PyUnicode_FromString(const char *u)
+ +PyObject*
+ +_PyUnicode_Copy(PyObject *unicode)
   {
- -    size_t size = strlen(u);
- -    if (size > PY_SSIZE_T_MAX) {
- -        PyErr_SetString(PyExc_OverflowError, "input too long");
+ +    Py_ssize_t length;
+ +    PyObject *copy;
+ +
+ +    if (!PyUnicode_Check(unicode)) {
+ +        PyErr_BadInternalCall();
           return NULL;
       }
+ +    if (PyUnicode_READY(unicode) == -1)
+ +        return NULL;
   
- -    return PyUnicode_FromStringAndSize(u, size);
- -}
- -
- -#ifdef HAVE_WCHAR_H
+ +    length = PyUnicode_GET_LENGTH(unicode);
+ +    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
+ +    if (!copy)
+ +        return NULL;
+ +    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
   
- -#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
- -# define CONVERT_WCHAR_TO_SURROGATES
- -#endif
+ +    Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
+ +              length * PyUnicode_KIND(unicode));
+ +    assert(_PyUnicode_CheckConsistency(copy, 1));
+ +    return copy;
+ +}
   
- -#ifdef CONVERT_WCHAR_TO_SURROGATES
   
- -/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
- -   to convert from UTF32 to UTF16. */
+ +/* Widen Unicode objects to larger buffers. Don't write terminating null
+ +   character. Return NULL on error. */
   
- -PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
- -                                 Py_ssize_t size)
+ +void*
+ +_PyUnicode_AsKind(PyObject *s, unsigned int kind)
   {
- -    PyUnicodeObject *unicode;
- -    register Py_ssize_t i;
- -    Py_ssize_t alloc;
- -    const wchar_t *orig_w;
+ +    Py_ssize_t len;
+ +    void *result;
+ +    unsigned int skind;
   
- -    if (w == NULL) {
- -        if (size == 0)
- -            return PyUnicode_FromStringAndSize(NULL, 0);
- -        PyErr_BadInternalCall();
+ +    if (PyUnicode_READY(s) == -1)
           return NULL;
- -    }
   
- -    if (size == -1) {
- -        size = wcslen(w);
+ +    len = PyUnicode_GET_LENGTH(s);
+ +    skind = PyUnicode_KIND(s);
+ +    if (skind >= kind) {
+ +        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
+ +        return NULL;
       }
- -
- -    alloc = size;
- -    orig_w = w;
- -    for (i = size; i > 0; i--) {
- -        if (*w > 0xFFFF)
- -            alloc++;
- -        w++;
+ +    switch (kind) {
+ +    case PyUnicode_2BYTE_KIND:
+ +        result = PyMem_Malloc(len * sizeof(Py_UCS2));
+ +        if (!result)
+ +            return PyErr_NoMemory();
+ +        assert(skind == PyUnicode_1BYTE_KIND);
+ +        _PyUnicode_CONVERT_BYTES(
+ +            Py_UCS1, Py_UCS2,
+ +            PyUnicode_1BYTE_DATA(s),
+ +            PyUnicode_1BYTE_DATA(s) + len,
+ +            result);
+ +        return result;
+ +    case PyUnicode_4BYTE_KIND:
+ +        result = PyMem_Malloc(len * sizeof(Py_UCS4));
+ +        if (!result)
+ +            return PyErr_NoMemory();
+ +        if (skind == PyUnicode_2BYTE_KIND) {
+ +            _PyUnicode_CONVERT_BYTES(
+ +                Py_UCS2, Py_UCS4,
+ +                PyUnicode_2BYTE_DATA(s),
+ +                PyUnicode_2BYTE_DATA(s) + len,
+ +                result);
+ +        }
+ +        else {
+ +            assert(skind == PyUnicode_1BYTE_KIND);
+ +            _PyUnicode_CONVERT_BYTES(
+ +                Py_UCS1, Py_UCS4,
+ +                PyUnicode_1BYTE_DATA(s),
+ +                PyUnicode_1BYTE_DATA(s) + len,
+ +                result);
+ +        }
+ +        return result;
+ +    default:
+ +        break;
       }
- -    w = orig_w;
- -    unicode = _PyUnicode_New(alloc);
- -    if (!unicode)
- -        return NULL;
+ +    PyErr_SetString(PyExc_SystemError, "invalid kind");
+ +    return NULL;
+ +}
   
- -    /* Copy the wchar_t data into the new object */
- -    {
- -        register Py_UNICODE *u;
- -        u = PyUnicode_AS_UNICODE(unicode);
- -        for (i = size; i > 0; i--) {
- -            if (*w > 0xFFFF) {
- -                wchar_t ordinal = *w++;
- -                ordinal -= 0x10000;
- -                *u++ = 0xD800 | (ordinal >> 10);
- -                *u++ = 0xDC00 | (ordinal & 0x3FF);
- -            }
- -            else
- -                *u++ = *w++;
+ +static Py_UCS4*
+ +as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
+ +        int copy_null)
+ +{
+ +    int kind;
+ +    void *data;
+ +    Py_ssize_t len, targetlen;
+ +    if (PyUnicode_READY(string) == -1)
+ +        return NULL;
+ +    kind = PyUnicode_KIND(string);
+ +    data = PyUnicode_DATA(string);
+ +    len = PyUnicode_GET_LENGTH(string);
+ +    targetlen = len;
+ +    if (copy_null)
+ +        targetlen++;
+ +    if (!target) {
+ +        if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
+ +            PyErr_NoMemory();
+ +            return NULL;
+ +        }
+ +        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
+ +        if (!target) {
+ +            PyErr_NoMemory();
+ +            return NULL;
           }
       }
- -    return (PyObject *)unicode;
+ +    else {
+ +        if (targetsize < targetlen) {
+ +            PyErr_Format(PyExc_SystemError,
+ +                         "string is longer than the buffer");
+ +            if (copy_null && 0 < targetsize)
+ +                target[0] = 0;
+ +            return NULL;
+ +        }
+ +    }
+ +    if (kind == PyUnicode_1BYTE_KIND) {
+ +        Py_UCS1 *start = (Py_UCS1 *) data;
+ +        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
+ +    }
+ +    else if (kind == PyUnicode_2BYTE_KIND) {
+ +        Py_UCS2 *start = (Py_UCS2 *) data;
+ +        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
+ +    }
+ +    else {
+ +        assert(kind == PyUnicode_4BYTE_KIND);
+ +        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
+ +    }
+ +    if (copy_null)
+ +        target[len] = 0;
+ +    return target;
   }
   
- -#else
+ +Py_UCS4*
+ +PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
+ +                 int copy_null)
+ +{
+ +    if (target == NULL || targetsize < 0) {
+ +        PyErr_BadInternalCall();
+ +        return NULL;
+ +    }
+ +    return as_ucs4(string, target, targetsize, copy_null);
+ +}
   
- -PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
- -                                 Py_ssize_t size)
+ +Py_UCS4*
+ +PyUnicode_AsUCS4Copy(PyObject *string)
   {
- -    PyUnicodeObject *unicode;
+ +    return as_ucs4(string, NULL, 0, 1);
+ +}
   
+ +#ifdef HAVE_WCHAR_H
+ +
+ +PyObject *
+ +PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
+ +{
       if (w == NULL) {
-         if (size == 0) {
-             Py_INCREF(unicode_empty);
-             return unicode_empty;
-         }
+         if (size == 0)
- -            return PyUnicode_FromStringAndSize(NULL, 0);
++            _Py_RETURN_UNICODE_EMPTY();
           PyErr_BadInternalCall();
           return NULL;
       }
@@@ -4720,68 -2590,181 +4726,67 @@@ PyUnicode_DecodeUTF8Stateful(const cha
       if (size == 0) {
           if (consumed)
               *consumed = 0;
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
- -        return (PyObject *)unicode;
++        _Py_RETURN_UNICODE_EMPTY();
       }
   
- -    /* Unpack UTF-8 encoded data */
- -    p = unicode->str;
- -    e = s + size;
- -    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
- -
- -    while (s < e) {
- -        Py_UCS4 ch = (unsigned char)*s;
- -
- -        if (ch < 0x80) {
- -            /* Fast path for runs of ASCII characters. Given that common UTF-8
- -               input will consist of an overwhelming majority of ASCII
- -               characters, we try to optimize for this case by checking
- -               as many characters as a C 'long' can contain.
- -               First, check if we can do an aligned read, as most CPUs have
- -               a penalty for unaligned reads.
- -            */
- -            if (!((size_t) s & LONG_PTR_MASK)) {
- -                /* Help register allocation */
- -                register const char *_s = s;
- -                register Py_UNICODE *_p = p;
- -                while (_s < aligned_end) {
- -                    /* Read a whole long at a time (either 4 or 8 bytes),
- -                       and do a fast unrolled copy if it only contains ASCII
- -                       characters. */
- -                    unsigned long data = *(unsigned long *) _s;
- -                    if (data & ASCII_CHAR_MASK)
- -                        break;
- -                    _p[0] = (unsigned char) _s[0];
- -                    _p[1] = (unsigned char) _s[1];
- -                    _p[2] = (unsigned char) _s[2];
- -                    _p[3] = (unsigned char) _s[3];
- -#if (SIZEOF_LONG == 8)
- -                    _p[4] = (unsigned char) _s[4];
- -                    _p[5] = (unsigned char) _s[5];
- -                    _p[6] = (unsigned char) _s[6];
- -                    _p[7] = (unsigned char) _s[7];
- -#endif
- -                    _s += SIZEOF_LONG;
- -                    _p += SIZEOF_LONG;
- -                }
- -                s = _s;
- -                p = _p;
- -                if (s == e)
- -                    break;
- -                ch = (unsigned char)*s;
- -            }
- -        }
- -
- -        if (ch < 0x80) {
- -            *p++ = (Py_UNICODE)ch;
- -            s++;
- -            continue;
- -        }
+ +    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
+ +    if (size == 1 && (unsigned char)s[0] < 128) {
+ +        if (consumed)
+ +            *consumed = 1;
+ +        return get_latin1_char((unsigned char)s[0]);
+ +    }
   
- -        n = utf8_code_length[ch];
+ +    unicode = PyUnicode_New(size, 127);
+ +    if (!unicode)
+ +        return NULL;
   
- -        if (s + n > e) {
- -            if (consumed)
- -                break;
- -            else {
- -                errmsg = "unexpected end of data";
- -                startinpos = s-starts;
- -                endinpos = startinpos+1;
- -                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
- -                    endinpos++;
- -                goto utf8Error;
- -            }
+ +    outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
+ +    s += outpos;
+ +    while (s < end) {
+ +        Py_UCS4 ch;
+ +        int kind = PyUnicode_KIND(unicode);
+ +        if (kind == PyUnicode_1BYTE_KIND) {
+ +            if (PyUnicode_IS_ASCII(unicode))
+ +                ch = asciilib_utf8_decode(&s, end,
+ +                        PyUnicode_1BYTE_DATA(unicode), &outpos);
+ +            else
+ +                ch = ucs1lib_utf8_decode(&s, end,
+ +                        PyUnicode_1BYTE_DATA(unicode), &outpos);
+ +        } else if (kind == PyUnicode_2BYTE_KIND) {
+ +            ch = ucs2lib_utf8_decode(&s, end,
+ +                    PyUnicode_2BYTE_DATA(unicode), &outpos);
+ +        } else {
+ +            assert(kind == PyUnicode_4BYTE_KIND);
+ +            ch = ucs4lib_utf8_decode(&s, end,
+ +                    PyUnicode_4BYTE_DATA(unicode), &outpos);
           }
   
- -        switch (n) {
- -
+ +        switch (ch) {
           case 0:
- -            errmsg = "invalid start byte";
- -            startinpos = s-starts;
- -            endinpos = startinpos+1;
- -            goto utf8Error;
- -
+ +            if (s == end || consumed)
+ +                goto End;
+ +            errmsg = "unexpected end of data";
+ +            startinpos = s - starts;
+ +            endinpos = end - starts;
+ +            break;
           case 1:
- -            errmsg = "internal error";
- -            startinpos = s-starts;
- -            endinpos = startinpos+1;
- -            goto utf8Error;
- -
- -        case 2:
- -            if ((s[1] & 0xc0) != 0x80) {
- -                errmsg = "invalid continuation byte";
- -                startinpos = s-starts;
- -                endinpos = startinpos + 1;
- -                goto utf8Error;
- -            }
- -            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
- -            assert ((ch > 0x007F) && (ch <= 0x07FF));
- -            *p++ = (Py_UNICODE)ch;
+ +            errmsg = "invalid start byte";
+ +            startinpos = s - starts;
+ +            endinpos = startinpos + 1;
               break;
- -
+ +        case 2:
           case 3:
- -            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
- -               will result in surrogates in range d800-dfff. Surrogates are
- -               not valid UTF-8 so they are rejected.
- -               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
- -               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
- -            if ((s[1] & 0xc0) != 0x80 ||
- -                (s[2] & 0xc0) != 0x80 ||
- -                ((unsigned char)s[0] == 0xE0 &&
- -                 (unsigned char)s[1] < 0xA0) ||
- -                ((unsigned char)s[0] == 0xED &&
- -                 (unsigned char)s[1] > 0x9F)) {
- -                errmsg = "invalid continuation byte";
- -                startinpos = s-starts;
- -                endinpos = startinpos + 1;
- -
- -                /* if s[1] first two bits are 1 and 0, then the invalid
- -                   continuation byte is s[2], so increment endinpos by 1,
- -                   if not, s[1] is invalid and endinpos doesn't need to
- -                   be incremented. */
- -                if ((s[1] & 0xC0) == 0x80)
- -                    endinpos++;
- -                goto utf8Error;
- -            }
- -            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
- -            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
- -            *p++ = (Py_UNICODE)ch;
- -            break;
- -
           case 4:
- -            if ((s[1] & 0xc0) != 0x80 ||
- -                (s[2] & 0xc0) != 0x80 ||
- -                (s[3] & 0xc0) != 0x80 ||
- -                ((unsigned char)s[0] == 0xF0 &&
- -                 (unsigned char)s[1] < 0x90) ||
- -                ((unsigned char)s[0] == 0xF4 &&
- -                 (unsigned char)s[1] > 0x8F)) {
- -                errmsg = "invalid continuation byte";
- -                startinpos = s-starts;
- -                endinpos = startinpos + 1;
- -                if ((s[1] & 0xC0) == 0x80) {
- -                    endinpos++;
- -                    if ((s[2] & 0xC0) == 0x80)
- -                        endinpos++;
- -                }
- -                goto utf8Error;
- -            }
- -            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
- -                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
- -            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
- -
- -#ifdef Py_UNICODE_WIDE
- -            *p++ = (Py_UNICODE)ch;
- -#else
- -            /*  compute and append the two surrogates: */
- -
- -            /*  translate from 10000..10FFFF to 0..FFFF */
- -            ch -= 0x10000;
- -
- -            /*  high surrogate = top 10 bits added to D800 */
- -            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
- -
- -            /*  low surrogate = bottom 10 bits added to DC00 */
- -            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
- -#endif
+ +            errmsg = "invalid continuation byte";
+ +            startinpos = s - starts;
+ +            endinpos = startinpos + ch - 1;
               break;
+ +        default:
+ +            if (unicode_putchar(&unicode, &outpos, ch) < 0)
+ +                goto onError;
+ +            continue;
           }
- -        s += n;
- -        continue;
   
- -      utf8Error:
- -        outpos = p-PyUnicode_AS_UNICODE(unicode);
           if (unicode_decode_call_errorhandler(
                   errors, &errorHandler,
                   "utf-8", errmsg,
@@@ -5215,71 -3439,54 +5220,70 @@@ PyUnicode_DecodeUTF16Stateful(const cha
          byte order setting accordingly. In native mode, the leading BOM
          mark is skipped, in all other modes, it is copied to the output
          stream as-is (giving a ZWNBSP character). */
- -    if (bo == 0) {
- -        if (size >= 2) {
- -            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
- -#ifdef BYTEORDER_IS_LITTLE_ENDIAN
- -            if (bom == 0xFEFF) {
- -                q += 2;
- -                bo = -1;
- -            }
- -            else if (bom == 0xFFFE) {
- -                q += 2;
- -                bo = 1;
- -            }
- -#else
- -            if (bom == 0xFEFF) {
- -                q += 2;
- -                bo = 1;
- -            }
- -            else if (bom == 0xFFFE) {
- -                q += 2;
- -                bo = -1;
- -            }
- -#endif
+ +    if (bo == 0 && size >= 2) {
+ +        const Py_UCS4 bom = (q[1] << 8) | q[0];
+ +        if (bom == 0xFEFF) {
+ +            q += 2;
+ +            bo = -1;
+ +        }
+ +        else if (bom == 0xFFFE) {
+ +            q += 2;
+ +            bo = 1;
           }
+ +        if (byteorder)
+ +            *byteorder = bo;
       }
   
- -    if (bo == -1) {
- -        /* force LE */
- -        ihi = 1;
- -        ilo = 0;
- -    }
- -    else if (bo == 1) {
- -        /* force BE */
- -        ihi = 0;
- -        ilo = 1;
+ +    if (q == e) {
+ +        if (consumed)
+ +            *consumed = size;
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
++        _Py_RETURN_UNICODE_EMPTY();
       }
+ +
   #ifdef BYTEORDER_IS_LITTLE_ENDIAN
- -    native_ordering = ilo < ihi;
+ +    native_ordering = bo <= 0;
   #else
- -    native_ordering = ilo > ihi;
+ +    native_ordering = bo >= 0;
   #endif
   
- -    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
+ +    /* Note: size will always be longer than the resulting Unicode
+ +       character count */
+ +    unicode = PyUnicode_New((e - q + 1) / 2, 127);
+ +    if (!unicode)
+ +        return NULL;
+ +
+ +    outpos = 0;
       while (1) {
- -        Py_UNICODE ch;
- -        if (e - q < 2) {
+ +        Py_UCS4 ch = 0;
+ +        if (e - q >= 2) {
+ +            int kind = PyUnicode_KIND(unicode);
+ +            if (kind == PyUnicode_1BYTE_KIND) {
+ +                if (PyUnicode_IS_ASCII(unicode))
+ +                    ch = asciilib_utf16_decode(&q, e,
+ +                            PyUnicode_1BYTE_DATA(unicode), &outpos,
+ +                            native_ordering);
+ +                else
+ +                    ch = ucs1lib_utf16_decode(&q, e,
+ +                            PyUnicode_1BYTE_DATA(unicode), &outpos,
+ +                            native_ordering);
+ +            } else if (kind == PyUnicode_2BYTE_KIND) {
+ +                ch = ucs2lib_utf16_decode(&q, e,
+ +                        PyUnicode_2BYTE_DATA(unicode), &outpos,
+ +                        native_ordering);
+ +            } else {
+ +                assert(kind == PyUnicode_4BYTE_KIND);
+ +                ch = ucs4lib_utf16_decode(&q, e,
+ +                        PyUnicode_4BYTE_DATA(unicode), &outpos,
+ +                        native_ordering);
+ +            }
+ +        }
+ +
+ +        switch (ch)
+ +        {
+ +        case 0:
               /* remaining byte at the end? (size should be even) */
               if (q == e || consumed)
- -                break;
+ +                goto End;
               errmsg = "truncated data";
               startinpos = ((const char *)q) - starts;
               endinpos = ((const char *)e) - starts;
@@@ -6558,27 -4831,19 +6562,25 @@@ PyUnicode_DecodeASCII(const char *s
       PyObject *errorHandler = NULL;
       PyObject *exc = NULL;
   
-     if (size == 0) {
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
-     }
++    if (size == 0)
++        _Py_RETURN_UNICODE_EMPTY();
+ +
       /* ASCII is equivalent to the first 128 ordinals in Unicode. */
- -    if (size == 1 && *(unsigned char*)s < 128) {
- -        Py_UNICODE r = *(unsigned char*)s;
- -        return PyUnicode_FromUnicode(&r, 1);
- -    }
+ +    if (size == 1 && (unsigned char)s[0] < 128)
+ +        return get_latin1_char((unsigned char)s[0]);
   
- -    v = _PyUnicode_New(size);
- -    if (v == NULL)
+ +    unicode = PyUnicode_New(size, 127);
+ +    if (unicode == NULL)
           goto onError;
- -    if (size == 0)
- -        return (PyObject *)v;
- -    p = PyUnicode_AS_UNICODE(v);
+ +
       e = s + size;
+ +    data = PyUnicode_1BYTE_DATA(unicode);
+ +    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
+ +    if (outpos == size)
+ +        return unicode;
+ +
+ +    s += outpos;
+ +    kind = PyUnicode_1BYTE_KIND;
       while (s < e) {
           register unsigned char c = (unsigned char)*s;
           if (c < 128) {
@@@ -6658,365 -4902,20 +6660,364 @@@ PyUnicode_AsASCIIString(PyObject *unico
   #define NEED_RETRY
   #endif
   
- -/* XXX This code is limited to "true" double-byte encodings, as
- -   a) it assumes an incomplete character consists of a single byte, and
- -   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
- -   encodings, see IsDBCSLeadByteEx documentation. */
+ +#ifndef WC_ERR_INVALID_CHARS
+ +#  define WC_ERR_INVALID_CHARS 0x0080
+ +#endif
+ +
+ +static char*
+ +code_page_name(UINT code_page, PyObject **obj)
+ +{   /* Return a name string for code_page; NULL if the "cp<N>" bytes object cannot be created. */
+ +    *obj = NULL;   /* stays NULL whenever one of the string literals below is returned */
+ +    if (code_page == CP_ACP)
+ +        return "mbcs";
+ +    if (code_page == CP_UTF7)
+ +        return "CP_UTF7";
+ +    if (code_page == CP_UTF8)
+ +        return "CP_UTF8";
+ +
+ +    *obj = PyBytes_FromFormat("cp%u", code_page);   /* any other page is named "cp<number>" */
+ +    if (*obj == NULL)
+ +        return NULL;
+ +    return PyBytes_AS_STRING(*obj);   /* the returned pointer refers to *obj's own buffer */
+ +}
+ +
+ +static int
+ +is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
+ +{   /* Return 1 if the byte at s[offset] is to be treated as a DBCS lead byte, else 0. */
+ +    const char *curr = s + offset;
+ +    const char *prev;
+ +
+ +    if (!IsDBCSLeadByteEx(code_page, *curr))
+ +        return 0;
+ +
+ +    prev = CharPrevExA(code_page, s, curr, 0);
+ +    if (prev == curr)   /* CharPrevExA did not step back from curr */
+ +        return 1;
+ +    /* FIXME: This code is limited to "true" double-byte encodings,
+ +       as it assumes an incomplete character consists of a single
+ +       byte. */
+ +    if (curr - prev == 2)
+ +        return 1;
+ +    if (!IsDBCSLeadByteEx(code_page, *prev))   /* the byte at prev is not a lead byte */
+ +        return 1;
+ +    return 0;
+ +}
+ +
+ +static DWORD
+ +decode_code_page_flags(UINT code_page)
+ +{   /* Pick the flags value the decoders pass to MultiByteToWideChar() for code_page. */
+ +    if (code_page == CP_UTF7) {
+ +        /* The CP_UTF7 decoder only supports flags=0 */
+ +        return 0;
+ +    }
+ +    else
+ +        return MB_ERR_INVALID_CHARS;
+ +}
+ +
+ +/*
+ + * Decode `insize` bytes at `in` from a Windows code page in strict mode,
+ + * creating *v when it is NULL and appending to it otherwise.
+ + *
+ + * Returns `insize` on success, -2 on ERROR_NO_UNICODE_TRANSLATION, or -1 on
+ + * any other failure (other API errors are set with PyErr_SetFromWindowsErr).
+ + */
+ +static int
+ +decode_code_page_strict(UINT code_page,
+ +                        PyObject **v,
+ +                        const char *in,
+ +                        int insize)
+ +{
+ +    const DWORD flags = decode_code_page_flags(code_page);
+ +    wchar_t *out;
+ +    DWORD outsize;
+ +
+ +    /* First get the size of the result */
+ +    assert(insize > 0);
+ +    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
+ +    if (outsize <= 0)   /* DWORD is unsigned, so this only matches 0 */
+ +        goto error;
+ +
+ +    if (*v == NULL) {
+ +        /* Create unicode object */
+ +        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
+ +        *v = (PyObject*)_PyUnicode_New(outsize);
+ +        if (*v == NULL)
+ +            return -1;
+ +        out = PyUnicode_AS_UNICODE(*v);
+ +    }
+ +    else {
+ +        /* Extend unicode object */
+ +        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
+ +        if (unicode_resize(v, n + outsize) < 0)
+ +            return -1;
+ +        out = PyUnicode_AS_UNICODE(*v) + n;   /* write after the n existing characters */
+ +    }
+ +
+ +    /* Do the conversion */
+ +    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
+ +    if (outsize <= 0)
+ +        goto error;
+ +    return insize;   /* the whole input chunk was consumed */
+ +
+ +error:
+ +    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
+ +        return -2;   /* decode error; no Python exception is set on this path */
+ +    PyErr_SetFromWindowsErr(0);
+ +    return -1;
+ +}
+ +
+ +/*
+ + * Decode a byte string from a code page into a unicode object, one character
+ + * at a time, passing undecodable input to the `errors` handler.
+ + *
+ + * Returns the consumed size on success, or raises a WindowsError or
+ + * UnicodeDecodeError exception and returns -1 on error.
+ + */
+ +static int
+ +decode_code_page_errors(UINT code_page,
+ +                        PyObject **v,
+ +                        const char *in, const int size,
+ +                        const char *errors)
+ +{
+ +    const char *startin = in;
+ +    const char *endin = in + size;
+ +    const DWORD flags = decode_code_page_flags(code_page);
+ +    /* Ideally, we should get reason from FormatMessage. This is the Windows
+ +       2000 English version of the message. */
+ +    const char *reason = "No mapping for the Unicode character exists "
+ +                         "in the target code page.";
+ +    /* each step cannot decode more than 1 character, but a character can be
+ +       represented as a surrogate pair */
+ +    wchar_t buffer[2], *startout, *out;
+ +    int insize, outsize;
+ +    PyObject *errorHandler = NULL;
+ +    PyObject *exc = NULL;
+ +    PyObject *encoding_obj = NULL;
+ +    char *encoding;
+ +    DWORD err;
+ +    int ret = -1;   /* only overwritten with `size` once decoding has completed */
+ +
+ +    assert(size > 0);
+ +
+ +    encoding = code_page_name(code_page, &encoding_obj);
+ +    if (encoding == NULL)
+ +        return -1;
+ +
+ +    if (errors == NULL || strcmp(errors, "strict") == 0) {
+ +        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
+ +           UnicodeDecodeError. */
+ +        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
+ +        if (exc != NULL) {
+ +            PyCodec_StrictErrors(exc);
+ +            Py_CLEAR(exc);
+ +        }
+ +        goto error;
+ +    }
+ +
+ +    if (*v == NULL) {
+ +        /* Create unicode object */
+ +        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {   /* size * 2 would overflow */
+ +            PyErr_NoMemory();
+ +            goto error;
+ +        }
+ +        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
+ +        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
+ +        if (*v == NULL)
+ +            goto error;
+ +        startout = PyUnicode_AS_UNICODE(*v);
+ +    }
+ +    else {
+ +        /* Extend unicode object */
+ +        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
+ +        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {   /* n + size * 2 would overflow */
+ +            PyErr_NoMemory();
+ +            goto error;
+ +        }
+ +        if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
+ +            goto error;
+ +        startout = PyUnicode_AS_UNICODE(*v) + n;
+ +    }
+ +
+ +    /* Decode the byte string character per character */
+ +    out = startout;
+ +    while (in < endin)
+ +    {
+ +        /* Decode a character */
+ +        insize = 1;   /* try one byte first; the loop below widens the window */
+ +        do
+ +        {
+ +            outsize = MultiByteToWideChar(code_page, flags,
+ +                                          in, insize,
+ +                                          buffer, Py_ARRAY_LENGTH(buffer));
+ +            if (outsize > 0)
+ +                break;
+ +            err = GetLastError();
+ +            if (err != ERROR_NO_UNICODE_TRANSLATION
+ +                && err != ERROR_INSUFFICIENT_BUFFER)
+ +            {
+ +                PyErr_SetFromWindowsErr(0);
+ +                goto error;
+ +            }
+ +            insize++;
+ +        }
+ +        /* 4=maximum length of a UTF-8 sequence */
+ +        while (insize <= 4 && (in + insize) <= endin);
+ +
+ +        if (outsize <= 0) {
+ +            Py_ssize_t startinpos, endinpos, outpos;
+ +
+ +            startinpos = in - startin;
+ +            endinpos = startinpos + 1;   /* report a one-byte error range */
+ +            outpos = out - PyUnicode_AS_UNICODE(*v);
+ +            if (unicode_decode_call_errorhandler(
+ +                    errors, &errorHandler,
+ +                    encoding, reason,
+ +                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
+ +                    v, &outpos))
+ +            {
+ +                goto error;
+ +            }
+ +            out = PyUnicode_AS_UNICODE(*v) + outpos;   /* resume at the position handed back in outpos */
+ +        }
+ +        else {
+ +            in += insize;
+ +            memcpy(out, buffer, outsize * sizeof(wchar_t));
+ +            out += outsize;
+ +        }
+ +    }
+ +
+ +    /* write a NUL character at the end */
+ +    *out = 0;
+ +
+ +    /* Extend unicode object */
+ +    outsize = out - startout;   /* NOTE(review): excludes the n characters already in *v when appending - confirm */
+ +    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
+ +    if (unicode_resize(v, outsize) < 0)
+ +        goto error;
+ +    ret = size;
+ +
+ +error:   /* shared exit path for success and failure; releases the local references */
+ +    Py_XDECREF(encoding_obj);
+ +    Py_XDECREF(errorHandler);
+ +    Py_XDECREF(exc);
+ +    return ret;
+ +}
+ +
+ +static PyObject *
+ +decode_code_page_stateful(int code_page,
+ +                          const char *s, Py_ssize_t size,
+ +                          const char *errors, Py_ssize_t *consumed)
+ +{   /* Decode s chunk by chunk: strict decoding first, the error-handler path on -2. */
+ +    PyObject *v = NULL;
+ +    int chunk_size, final, converted, done;
+ +
+ +    if (code_page < 0) {
+ +        PyErr_SetString(PyExc_ValueError, "invalid code page number");
+ +        return NULL;
+ +    }
+ +
+ +    if (consumed)
+ +        *consumed = 0;
+ +
+ +    do
+ +    {
+ +#ifdef NEED_RETRY
+ +        if (size > INT_MAX) {   /* cap the chunk at INT_MAX bytes and loop again */
+ +            chunk_size = INT_MAX;
+ +            final = 0;
+ +            done = 0;
+ +        }
+ +        else
+ +#endif
+ +        {
+ +            chunk_size = (int)size;
+ +            final = (consumed == NULL);   /* no `consumed` pointer: treat this chunk as final */
+ +            done = 1;
+ +        }
+ +
+ +        /* Skip trailing lead-byte unless 'final' is set */
+ +        if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))   /* NOTE(review): offset is -1 when size == 0 and consumed != NULL - confirm */
+ +            --chunk_size;
+ +
+ +        if (chunk_size == 0 && done) {
+ +            if (v != NULL)
+ +                break;
-             Py_INCREF(unicode_empty);
-             return unicode_empty;
++            _Py_RETURN_UNICODE_EMPTY();
+ +        }
+ +
+ +
+ +        converted = decode_code_page_strict(code_page, &v,
+ +                                            s, chunk_size);
+ +        if (converted == -2)   /* strict decoding reported a decode error */
+ +            converted = decode_code_page_errors(code_page, &v,
+ +                                                s, chunk_size,
+ +                                                errors);
+ +        assert(converted != 0);
+ +
+ +        if (converted < 0) {
+ +            Py_XDECREF(v);
+ +            return NULL;
+ +        }
+ +
+ +        if (consumed)
+ +            *consumed += converted;
+ +
+ +        s += converted;   /* advance past the bytes handled in this pass */
+ +        size -= converted;
+ +    } while (!done);
+ +
+ +    return unicode_result(v);
+ +}
+ +
+ +PyObject *
+ +PyUnicode_DecodeCodePageStateful(int code_page,
+ +                                 const char *s,
+ +                                 Py_ssize_t size,
+ +                                 const char *errors,
+ +                                 Py_ssize_t *consumed)
+ +{   /* Non-static wrapper: forwards every argument unchanged to the static helper. */
+ +    return decode_code_page_stateful(code_page, s, size, errors, consumed);
+ +}
+ +
+ +PyObject *
+ +PyUnicode_DecodeMBCSStateful(const char *s,
+ +                             Py_ssize_t size,
+ +                             const char *errors,
+ +                             Py_ssize_t *consumed)
+ +{   /* Same as the stateful code-page decoder with the code page fixed to CP_ACP. */
+ +    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
+ +}
   
- -static int is_dbcs_lead_byte(const char *s, int offset)
+ +PyObject *
+ +PyUnicode_DecodeMBCS(const char *s,
+ +                     Py_ssize_t size,
+ +                     const char *errors)
   {
- -    const char *curr = s + offset;
+ +    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);   /* consumed = NULL */
+ +}
   
- -    if (IsDBCSLeadByte(*curr)) {
- -        const char *prev = CharPrev(s, curr);
- -        return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
+ +static DWORD
+ +encode_code_page_flags(UINT code_page, const char *errors)
+ +{   /* Select the encoder flags for code_page and the `errors` mode (presumably for WideCharToMultiByte - confirm). */
+ +    if (code_page == CP_UTF8) {
+ +        if (winver.dwMajorVersion >= 6)
+ +            /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
+ +               and later */
+ +            return WC_ERR_INVALID_CHARS;
+ +        else
+ +            /* CP_UTF8 only supports flags=0 on Windows older than Vista */
+ +            return 0;
+ +    }
+ +    else if (code_page == CP_UTF7) {
+ +        /* CP_UTF7 only supports flags=0 */
+ +        return 0;
+ +    }
+ +    else {
+ +        if (errors != NULL && strcmp(errors, "replace") == 0)   /* only "replace" omits WC_NO_BEST_FIT_CHARS */
+ +            return 0;
+ +        else
+ +            return WC_NO_BEST_FIT_CHARS;
       }
- -    return 0;
   }
   
   /*
@@@ -9502,24 -6803,18 +9503,22 @@@ PyUnicode_Join(PyObject *separator, PyO
       seqlen = PySequence_Fast_GET_SIZE(fseq);
       /* If empty sequence, return u"". */
       if (seqlen == 0) {
- -        res = _PyUnicode_New(0);  /* empty sequence; return u"" */
- -        goto Done;
+ +        Py_DECREF(fseq);
-         Py_INCREF(unicode_empty);
-         res = unicode_empty;
-         return res;
++        _Py_RETURN_UNICODE_EMPTY();
       }
- -    items = PySequence_Fast_ITEMS(fseq);
+ +
       /* If singleton sequence with an exact Unicode, return that. */
+ +    last_obj = NULL;
+ +    items = PySequence_Fast_ITEMS(fseq);
       if (seqlen == 1) {
- -        item = items[0];
- -        if (PyUnicode_CheckExact(item)) {
- -            Py_INCREF(item);
- -            res = (PyUnicodeObject *)item;
- -            goto Done;
+ +        if (PyUnicode_CheckExact(items[0])) {
+ +            res = items[0];
+ +            Py_INCREF(res);
+ +            Py_DECREF(fseq);
+ +            return res;
           }
+ +        seplen = 0;
+ +        maxchar = 0;
       }
       else {
           /* Set up sep and seplen */
@@@ -10052,180 -6991,96 +10051,182 @@@ replace(PyObject *self, PyObject *str1
   
       if (maxcount < 0)
           maxcount = PY_SSIZE_T_MAX;
- -    else if (maxcount == 0 || self->length == 0)
+ +    else if (maxcount == 0 || slen == 0)
           goto nothing;
   
- -    if (str1->length == str2->length) {
- -        Py_ssize_t i;
+ +    if (str1 == str2)
+ +        goto nothing;
+ +    if (skind < kind1)
+ +        /* substring too wide to be present */
+ +        goto nothing;
+ +
+ +    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
+ +    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
+ +    /* Replacing str1 with str2 may cause a maxchar reduction in the
+ +       result string. */
+ +    mayshrink = (maxchar_str2 < maxchar);
+ +    maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
+ +
+ +    if (len1 == len2) {
           /* same length */
- -        if (str1->length == 0)
+ +        if (len1 == 0)
               goto nothing;
- -        if (str1->length == 1) {
+ +        if (len1 == 1) {
               /* replace characters */
- -            Py_UNICODE u1, u2;
- -            if (!findchar(self->str, self->length, str1->str[0]))
+ +            Py_UCS4 u1, u2;
+ +            int rkind;
+ +            Py_ssize_t index, pos;
+ +            char *src;
+ +
+ +            u1 = PyUnicode_READ_CHAR(str1, 0);
+ +            pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
+ +            if (pos < 0)
                   goto nothing;
- -            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
+ +            u2 = PyUnicode_READ_CHAR(str2, 0);
+ +            u = PyUnicode_New(slen, maxchar);
               if (!u)
- -                return NULL;
- -            Py_UNICODE_COPY(u->str, self->str, self->length);
- -            u1 = str1->str[0];
- -            u2 = str2->str[0];
- -            for (i = 0; i < u->length; i++)
- -                if (u->str[i] == u1) {
- -                    if (--maxcount < 0)
- -                        break;
- -                    u->str[i] = u2;
- -                }
- -        } else {
- -            i = stringlib_find(
- -                self->str, self->length, str1->str, str1->length, 0
- -                );
+ +                goto error;
+ +            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
+ +            rkind = PyUnicode_KIND(u);
+ +
+ +            PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
+ +            index = 0;
+ +            src = sbuf;
+ +            while (--maxcount)
+ +            {
+ +                pos++;
+ +                src += pos * PyUnicode_KIND(self);
+ +                slen -= pos;
+ +                index += pos;
+ +                pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
+ +                if (pos < 0)
+ +                    break;
+ +                PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
+ +            }
+ +        }
+ +        else {
+ +            int rkind = skind;
+ +            char *res;
+ +            Py_ssize_t i;
+ +
+ +            if (kind1 < rkind) {
+ +                /* widen substring */
+ +                buf1 = _PyUnicode_AsKind(str1, rkind);
+ +                if (!buf1) goto error;
+ +                release1 = 1;
+ +            }
+ +            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
               if (i < 0)
                   goto nothing;
- -            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
+ +            if (rkind > kind2) {
+ +                /* widen replacement */
+ +                buf2 = _PyUnicode_AsKind(str2, rkind);
+ +                if (!buf2) goto error;
+ +                release2 = 1;
+ +            }
+ +            else if (rkind < kind2) {
+ +                /* widen self and buf1 */
+ +                rkind = kind2;
+ +                if (release1) PyMem_Free(buf1);
+ +                release1 = 0;
+ +                sbuf = _PyUnicode_AsKind(self, rkind);
+ +                if (!sbuf) goto error;
+ +                srelease = 1;
+ +                buf1 = _PyUnicode_AsKind(str1, rkind);
+ +                if (!buf1) goto error;
+ +                release1 = 1;
+ +            }
+ +            u = PyUnicode_New(slen, maxchar);
               if (!u)
- -                return NULL;
- -            Py_UNICODE_COPY(u->str, self->str, self->length);
+ +                goto error;
+ +            assert(PyUnicode_KIND(u) == rkind);
+ +            res = PyUnicode_DATA(u);
   
+ +            memcpy(res, sbuf, rkind * slen);
               /* change everything in-place, starting with this one */
- -            Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
- -            i += str1->length;
+ +            memcpy(res + rkind * i,
+ +                   buf2,
+ +                   rkind * len2);
+ +            i += len1;
   
               while ( --maxcount > 0) {
- -                i = stringlib_find(self->str+i, self->length-i,
- -                                   str1->str, str1->length,
- -                                   i);
+ +                i = anylib_find(rkind, self,
+ +                                sbuf+rkind*i, slen-i,
+ +                                str1, buf1, len1, i);
                   if (i == -1)
                       break;
- -                Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
- -                i += str1->length;
+ +                memcpy(res + rkind * i,
+ +                       buf2,
+ +                       rkind * len2);
+ +                i += len1;
               }
           }
- -    } else {
- -
- -        Py_ssize_t n, i, j;
- -        Py_ssize_t product, new_size, delta;
- -        Py_UNICODE *p;
- -
- -        /* replace strings */
- -        n = stringlib_count(self->str, self->length, str1->str, str1->length,
- -                            maxcount);
+ +    }
+ +    else {
+ +        Py_ssize_t n, i, j, ires;
+ +        Py_ssize_t new_size;
+ +        int rkind = skind;
+ +        char *res;
+ +
+ +        if (kind1 < rkind) {
+ +            /* widen substring */
+ +            buf1 = _PyUnicode_AsKind(str1, rkind);
+ +            if (!buf1) goto error;
+ +            release1 = 1;
+ +        }
+ +        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
           if (n == 0)
               goto nothing;
- -        /* new_size = self->length + n * (str2->length - str1->length)); */
- -        delta = (str2->length - str1->length);
- -        if (delta == 0) {
- -            new_size = self->length;
- -        } else {
- -            product = n * (str2->length - str1->length);
- -            if ((product / (str2->length - str1->length)) != n) {
- -                PyErr_SetString(PyExc_OverflowError,
- -                                "replace string is too long");
- -                return NULL;
- -            }
- -            new_size = self->length + product;
- -            if (new_size < 0) {
+ +        if (kind2 < rkind) {
+ +            /* widen replacement */
+ +            buf2 = _PyUnicode_AsKind(str2, rkind);
+ +            if (!buf2) goto error;
+ +            release2 = 1;
+ +        }
+ +        else if (kind2 > rkind) {
+ +            /* widen self and buf1 */
+ +            rkind = kind2;
+ +            sbuf = _PyUnicode_AsKind(self, rkind);
+ +            if (!sbuf) goto error;
+ +            srelease = 1;
+ +            if (release1) PyMem_Free(buf1);
+ +            release1 = 0;
+ +            buf1 = _PyUnicode_AsKind(str1, rkind);
+ +            if (!buf1) goto error;
+ +            release1 = 1;
+ +        }
+ +        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
+ +           PyUnicode_GET_LENGTH(str1))); */
+ +        if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
                   PyErr_SetString(PyExc_OverflowError,
                                   "replace string is too long");
- -                return NULL;
- -            }
+ +                goto error;
           }
- -        u = _PyUnicode_New(new_size);
+ +        new_size = slen + n * (len2 - len1);
+ +        if (new_size == 0) {
-             Py_INCREF(unicode_empty);
++            _Py_INCREF_UNICODE_EMPTY();
++            if (!unicode_empty)
++                goto error;
+ +            u = unicode_empty;
+ +            goto done;
+ +        }
+ +        if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
+ +            PyErr_SetString(PyExc_OverflowError,
+ +                            "replace string is too long");
+ +            goto error;
+ +        }
+ +        u = PyUnicode_New(new_size, maxchar);
           if (!u)
- -            return NULL;
- -        i = 0;
- -        p = u->str;
- -        if (str1->length > 0) {
+ +            goto error;
+ +        assert(PyUnicode_KIND(u) == rkind);
+ +        res = PyUnicode_DATA(u);
+ +        ires = i = 0;
+ +        if (len1 > 0) {
               while (n-- > 0) {
                   /* look for next match */
- -                j = stringlib_find(self->str+i, self->length-i,
- -                                   str1->str, str1->length,
- -                                   i);
+ +                j = anylib_find(rkind, self,
+ +                                sbuf + rkind * i, slen-i,
+ +                                str1, buf1, len1, i);
                   if (j == -1)
                       break;
                   else if (j > i) {
@@@ -11649,61 -8208,20 +11650,59 @@@ _PyUnicode_XStrip(PyObject *self, int s
           j++;
       }
   
- -    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
- -        Py_INCREF(self);
- -        return (PyObject*)self;
- -    }
- -    else
- -        return PyUnicode_FromUnicode(s+i, j-i);
+ +    return PyUnicode_Substring(self, i, j);
   }
   
-     if (start >= length || end < start) {
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
-     }
+ +PyObject*
+ +PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
+ +{
+ +    unsigned char *data;
+ +    int kind;
+ +    Py_ssize_t length;
+ +
+ +    if (PyUnicode_READY(self) == -1)
+ +        return NULL;
+ +
+ +    length = PyUnicode_GET_LENGTH(self);
+ +    end = Py_MIN(end, length);
+ +
+ +    if (start == 0 && end == length)
+ +        return unicode_result_unchanged(self);
+ +
+ +    if (start < 0 || end < 0) {
+ +        PyErr_SetString(PyExc_IndexError, "string index out of range");
+ +        return NULL;
+ +    }
++    if (start >= length || end < start)
++        _Py_RETURN_UNICODE_EMPTY();
+ +
+ +    length = end - start;
+ +    if (PyUnicode_IS_ASCII(self)) {
+ +        data = PyUnicode_1BYTE_DATA(self);
+ +        return _PyUnicode_FromASCII((char*)(data + start), length);
+ +    }
+ +    else {
+ +        kind = PyUnicode_KIND(self);
+ +        data = PyUnicode_1BYTE_DATA(self);
+ +        return PyUnicode_FromKindAndData(kind,
+ +                                         data + kind * start,
+ +                                         length);
+ +    }
+ +}
   
   static PyObject *
- -do_strip(PyUnicodeObject *self, int striptype)
+ +do_strip(PyObject *self, int striptype)
   {
- -    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
- -    Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
+ +    int kind;
+ +    void *data;
+ +    Py_ssize_t len, i, j;
+ +
+ +    if (PyUnicode_READY(self) == -1)
+ +        return NULL;
+ +
+ +    kind = PyUnicode_KIND(self);
+ +    data = PyUnicode_DATA(self);
+ +    len = PyUnicode_GET_LENGTH(self);
   
       i = 0;
       if (striptype != RIGHTSTRIP) {
@@@ -11797,24 -8320,33 +11796,22 @@@ unicode_rstrip(PyObject *self, PyObjec
   
   
   static PyObject*
- -unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
+ +unicode_repeat(PyObject *str, Py_ssize_t len)
   {
- -    PyUnicodeObject *u;
- -    Py_UNICODE *p;
- -    Py_ssize_t nchars;
- -    size_t nbytes;
+ +    PyObject *u;
+ +    Py_ssize_t nchars, n;
   
-     if (len < 1) {
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
-     }
+     if (len < 1)
+         _Py_RETURN_UNICODE_EMPTY();
   
- -    if (len == 1 && PyUnicode_CheckExact(str)) {
- -        /* no repeat, return original string */
- -        Py_INCREF(str);
- -        return (PyObject*) str;
- -    }
+ +    /* no repeat, return original string */
+ +    if (len == 1)
+ +        return unicode_result_unchanged(str);
   
- -    /* ensure # of chars needed doesn't overflow int and # of bytes
- -     * needed doesn't overflow size_t
- -     */
- -    nchars = len * str->length;
- -    if (nchars / len != str->length) {
- -        PyErr_SetString(PyExc_OverflowError,
- -                        "repeated string is too long");
+ +    if (PyUnicode_READY(str) == -1)
           return NULL;
- -    }
- -    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
- -    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
+ +
+ +    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
           PyErr_SetString(PyExc_OverflowError,
                           "repeated string is too long");
           return NULL;
@@@ -12797,160 -9186,7 +12794,159 @@@ unicode_endswith(PyObject *self
       return PyBool_FromLong(result);
   }
   
- -#include "stringlib/string_format.h"
+ +Py_LOCAL_INLINE(void)
+ +_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
+ +{
+ +    writer->size = PyUnicode_GET_LENGTH(writer->buffer);
+ +    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
+ +    writer->data = PyUnicode_DATA(writer->buffer);
+ +    writer->kind = PyUnicode_KIND(writer->buffer);
+ +}
+ +
+ +void
+ +_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
+ +{
+ +    memset(writer, 0, sizeof(*writer));
+ +#ifdef Py_DEBUG
+ +    writer->kind = 5;    /* invalid kind */
+ +#endif
+ +    writer->min_length = Py_MAX(min_length, 100);
+ +    writer->overallocate = (min_length > 0);
+ +}
+ +
+ +int
+ +_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
+ +                                 Py_ssize_t length, Py_UCS4 maxchar)
+ +{
+ +    Py_ssize_t newlen;
+ +    PyObject *newbuffer;
+ +
+ +    assert(length > 0);
+ +
+ +    if (length > PY_SSIZE_T_MAX - writer->pos) {
+ +        PyErr_NoMemory();
+ +        return -1;
+ +    }
+ +    newlen = writer->pos + length;
+ +
+ +    if (writer->buffer == NULL) {
+ +        if (writer->overallocate) {
+ +            /* overallocate 25% to limit the number of resize */
+ +            if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
+ +                newlen += newlen / 4;
+ +            if (newlen < writer->min_length)
+ +                newlen = writer->min_length;
+ +        }
+ +        writer->buffer = PyUnicode_New(newlen, maxchar);
+ +        if (writer->buffer == NULL)
+ +            return -1;
+ +        _PyUnicodeWriter_Update(writer);
+ +        return 0;
+ +    }
+ +
+ +    if (newlen > writer->size) {
+ +        if (writer->overallocate) {
+ +            /* overallocate 25% to limit the number of resize */
+ +            if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
+ +                newlen += newlen / 4;
+ +            if (newlen < writer->min_length)
+ +                newlen = writer->min_length;
+ +        }
+ +
+ +        if (maxchar > writer->maxchar || writer->readonly) {
+ +            /* resize + widen */
+ +            newbuffer = PyUnicode_New(newlen, maxchar);
+ +            if (newbuffer == NULL)
+ +                return -1;
+ +            _PyUnicode_FastCopyCharacters(newbuffer, 0,
+ +                                          writer->buffer, 0, writer->pos);
+ +            Py_DECREF(writer->buffer);
+ +            writer->readonly = 0;
+ +        }
+ +        else {
+ +            newbuffer = resize_compact(writer->buffer, newlen);
+ +            if (newbuffer == NULL)
+ +                return -1;
+ +        }
+ +        writer->buffer = newbuffer;
+ +        _PyUnicodeWriter_Update(writer);
+ +    }
+ +    else if (maxchar > writer->maxchar) {
+ +        assert(!writer->readonly);
+ +        newbuffer = PyUnicode_New(writer->size, maxchar);
+ +        if (newbuffer == NULL)
+ +            return -1;
+ +        _PyUnicode_FastCopyCharacters(newbuffer, 0,
+ +                                      writer->buffer, 0, writer->pos);
+ +        Py_DECREF(writer->buffer);
+ +        writer->buffer = newbuffer;
+ +        _PyUnicodeWriter_Update(writer);
+ +    }
+ +    return 0;
+ +}
+ +
+ +int
+ +_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
+ +{
+ +    Py_UCS4 maxchar;
+ +    Py_ssize_t len;
+ +
+ +    if (PyUnicode_READY(str) == -1)
+ +        return -1;
+ +    len = PyUnicode_GET_LENGTH(str);
+ +    if (len == 0)
+ +        return 0;
+ +    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
+ +    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
+ +        if (writer->buffer == NULL && !writer->overallocate) {
+ +            Py_INCREF(str);
+ +            writer->buffer = str;
+ +            _PyUnicodeWriter_Update(writer);
+ +            writer->readonly = 1;
+ +            writer->size = 0;
+ +            writer->pos += len;
+ +            return 0;
+ +        }
+ +        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
+ +            return -1;
+ +    }
+ +    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
+ +                                  str, 0, len);
+ +    writer->pos += len;
+ +    return 0;
+ +}
+ +
+ +PyObject *
+ +_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
+ +{
+ +    if (writer->pos == 0) {
+ +        Py_XDECREF(writer->buffer);
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
++        _Py_RETURN_UNICODE_EMPTY();
+ +    }
+ +    if (writer->readonly) {
+ +        assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
+ +        return writer->buffer;
+ +    }
+ +    if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
+ +        PyObject *newbuffer;
+ +        newbuffer = resize_compact(writer->buffer, writer->pos);
+ +        if (newbuffer == NULL) {
+ +            Py_DECREF(writer->buffer);
+ +            return NULL;
+ +        }
+ +        writer->buffer = newbuffer;
+ +    }
+ +    assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
+ +    return writer->buffer;
+ +}
+ +
+ +void
+ +_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
+ +{
+ +    Py_CLEAR(writer->buffer);
+ +}
+ +
+ +#include "stringlib/unicode_format.h"
   
   PyDoc_STRVAR(format__doc__,
                "S.format(*args, **kwargs) -> str\n\
@@@ -13143,44 -9345,29 +13139,43 @@@ unicode_subscript(PyObject* self, PyObj
           }
   
           if (slicelength <= 0) {
-             Py_INCREF(unicode_empty);
-             return unicode_empty;
- -            return PyUnicode_FromUnicode(NULL, 0);
- -        } else if (start == 0 && step == 1 && slicelength == self->length &&
- -                   PyUnicode_CheckExact(self)) {
- -            Py_INCREF(self);
- -            return (PyObject *)self;
++            _Py_RETURN_UNICODE_EMPTY();
+ +        } else if (start == 0 && step == 1 &&
+ +                   slicelength == PyUnicode_GET_LENGTH(self)) {
+ +            return unicode_result_unchanged(self);
           } else if (step == 1) {
- -            return PyUnicode_FromUnicode(self->str + start, slicelength);
- -        } else {
- -            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
- -            result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
- -                                                       sizeof(Py_UNICODE));
- -
- -            if (result_buf == NULL)
- -                return PyErr_NoMemory();
- -
+ +            return PyUnicode_Substring(self,
+ +                                       start, start + slicelength);
+ +        }
+ +        /* General case */
+ +        src_kind = PyUnicode_KIND(self);
+ +        src_data = PyUnicode_DATA(self);
+ +        if (!PyUnicode_IS_ASCII(self)) {
+ +            kind_limit = kind_maxchar_limit(src_kind);
+ +            max_char = 0;
               for (cur = start, i = 0; i < slicelength; cur += step, i++) {
- -                result_buf[i] = source_buf[cur];
+ +                ch = PyUnicode_READ(src_kind, src_data, cur);
+ +                if (ch > max_char) {
+ +                    max_char = ch;
+ +                    if (max_char >= kind_limit)
+ +                        break;
+ +                }
               }
+ +        }
+ +        else
+ +            max_char = 127;
+ +        result = PyUnicode_New(slicelength, max_char);
+ +        if (result == NULL)
+ +            return NULL;
+ +        dest_kind = PyUnicode_KIND(result);
+ +        dest_data = PyUnicode_DATA(result);
   
- -            result = PyUnicode_FromUnicode(result_buf, slicelength);
- -            PyObject_FREE(result_buf);
- -            return result;
+ +        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
+ +            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
+ +            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
           }
+ +        assert(_PyUnicode_CheckConsistency(result, 1));
+ +        return result;
       } else {
           PyErr_SetString(PyExc_TypeError, "string indices must be integers");
           return NULL;
@@@ -13974,10 -9958,8 +13969,8 @@@ unicode_new(PyTypeObject *type, PyObjec
       if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
                                        kwlist, &x, &encoding, &errors))
           return NULL;
-     if (x == NULL) {
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
-     }
+     if (x == NULL)
- -        return (PyObject *)_PyUnicode_New(0);
++        _Py_RETURN_UNICODE_EMPTY();
       if (encoding == NULL && errors == NULL)
           return PyObject_Str(x);
       else
@@@ -14144,12 -10056,10 +14137,10 @@@ PyTypeObject PyUnicode_Type = 
   
   /* Initialize the Unicode implementation */
   
- -void _PyUnicode_Init(void)
+ +int _PyUnicode_Init(void)
   {
-     int i;
- 
       /* XXX - move this array to unicodectype.c ? */
- -    Py_UNICODE linebreak[] = {
+ +    Py_UCS2 linebreak[] = {
           0x000A, /* LINE FEED */
           0x000D, /* CARRIAGE RETURN */
           0x001C, /* FILE SEPARATOR */
@@@ -14161,13 -10071,12 +14152,11 @@@
       };
   
       /* Init the implementation */
-     unicode_empty = PyUnicode_New(0, 0);
- -    if (!unicode_empty) {
- -        unicode_empty = _PyUnicode_New(0);
- -        if (!unicode_empty)
- -            return;
- -    }
++    _Py_INCREF_UNICODE_EMPTY();
+ +    if (!unicode_empty)
+ +        Py_FatalError("Can't create empty string");
-     assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
++    Py_DECREF(unicode_empty);
   
-     for (i = 0; i < 256; i++)
-         unicode_latin1[i] = NULL;
       if (PyType_Ready(&PyUnicode_Type) < 0)
           Py_FatalError("Can't initialize 'unicode'");
   
@@@ -14207,16 -10121,11 +14196,11 @@@ _PyUnicode_Fini(void
   {
       int i;
   
-     Py_XDECREF(unicode_empty);
-     unicode_empty = NULL;
+     Py_CLEAR(unicode_empty);
   
-     for (i = 0; i < 256; i++) {
-         if (unicode_latin1[i]) {
-             Py_DECREF(unicode_latin1[i]);
-             unicode_latin1[i] = NULL;
-         }
-     }
+     for (i = 0; i < 256; i++)
+         Py_CLEAR(unicode_latin1[i]);
- -
+ +    _PyUnicode_ClearStaticStrings();
       (void)PyUnicode_ClearFreeList();
   }
author	Serhiy Storchaka <storchaka@gmail.com>
	Sat, 26 Jan 2013 10:16:36 +0000 (12:16 +0200)
committer	Serhiy Storchaka <storchaka@gmail.com>
	Sat, 26 Jan 2013 10:16:36 +0000 (12:16 +0200)
		1	2
Misc/NEWS	patch \|	diff1 \|	diff2 \|	blob \| history
Objects/unicodeobject.c	patch \|	diff1 \|	diff2 \|	blob \| history