Issue #10156: In the interpreter's initialization phase, unicode globals are now initialized dynamically as needed.

author Serhiy Storchaka <storchaka@gmail.com>

Sat, 26 Jan 2013 10:18:17 +0000 (12:18 +0200)

committer Serhiy Storchaka <storchaka@gmail.com>

Sat, 26 Jan 2013 10:18:17 +0000 (12:18 +0200)
author Serhiy Storchaka <storchaka@gmail.com>
Sat, 26 Jan 2013 10:18:17 +0000 (12:18 +0200)
committer Serhiy Storchaka <storchaka@gmail.com>
Sat, 26 Jan 2013 10:18:17 +0000 (12:18 +0200)
diff --cc Misc/NEWS
Simple merge
diff --cc Objects/unicodeobject.c

index 5030e8d6349e382cfe29410acb59c57e6f3abe28,c96a91c3973218b3a30000e31e714eb725b1f1bb..b4f4185caacf25acac01d6922a330619a87f1af9
--- 1/Objects/unicodeobject.c
--- 2/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@@ -47,10 -47,19 +47,11 @@@ OF OR IN CONNECTION WITH THE USE OR PER
   #include <windows.h>
   #endif
   
- -/* Endianness switches; defaults to little endian */
- -
- -#ifdef WORDS_BIGENDIAN
- -# define BYTEORDER_IS_BIG_ENDIAN
- -#else
- -# define BYTEORDER_IS_LITTLE_ENDIAN
- -#endif
- -
   /* --- Globals ------------------------------------------------------------
   
-    The globals are initialized by the _PyUnicode_Init() API and should
-    not be used before calling that API.
+ NOTE: In the interpreter's initialization phase, some globals are currently
+       initialized dynamically as needed. In the process Unicode objects may
+       be created before the Unicode type is ready.
   
   */
   
@@@ -404,11 -432,12 +424,10 @@@ unicode_result_wchar(PyObject *unicode
   #ifndef Py_DEBUG
       Py_ssize_t len;
   
- -    assert(Py_REFCNT(unicode) == 1);
- -
       len = _PyUnicode_WSTR_LENGTH(unicode);
       if (len == 0) {
-         Py_INCREF(unicode_empty);
           Py_DECREF(unicode);
-         return unicode_empty;
+         _Py_RETURN_UNICODE_EMPTY();
       }
   
       if (len == 1) {
@@@ -4201,16 -4330,14 +4207,15 @@@ PyUnicode_DecodeUTF7Stateful(const cha
       if (size == 0) {
           if (consumed)
               *consumed = 0;
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
- -        return unicode;
++        _Py_RETURN_UNICODE_EMPTY();
       }
   
- -    shiftOutStart = outpos = 0;
+ +    /* Start off assuming it's all ASCII. Widen later as necessary. */
+ +    _PyUnicodeWriter_Init(&writer, 0);
+ +    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
+ +        goto onError;
+ +
+ +    shiftOutStart = 0;
       e = s + size;
   
       while (s < e) {
@@@ -4851,92 -4980,84 +4855,91 @@@ PyUnicode_DecodeUTF32Stateful(const cha
          byte order setting accordingly. In native mode, the leading BOM
          mark is skipped, in all other modes, it is copied to the output
          stream as-is (giving a ZWNBSP character). */
- -    if (bo == 0) {
- -        if (size >= 4) {
- -            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
- -                (q[iorder[1]] << 8) | q[iorder[0]];
- -#ifdef BYTEORDER_IS_LITTLE_ENDIAN
- -            if (bom == 0x0000FEFF) {
- -                q += 4;
- -                bo = -1;
- -            }
- -            else if (bom == 0xFFFE0000) {
- -                q += 4;
- -                bo = 1;
- -            }
- -#else
- -            if (bom == 0x0000FEFF) {
- -                q += 4;
- -                bo = 1;
- -            }
- -            else if (bom == 0xFFFE0000) {
- -                q += 4;
- -                bo = -1;
- -            }
- -#endif
+ +    if (bo == 0 && size >= 4) {
+ +        Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
+ +        if (bom == 0x0000FEFF) {
+ +            bo = -1;
+ +            q += 4;
+ +        }
+ +        else if (bom == 0xFFFE0000) {
+ +            bo = 1;
+ +            q += 4;
           }
+ +        if (byteorder)
+ +            *byteorder = bo;
       }
   
- -    if (bo == -1) {
- -        /* force LE */
- -        iorder[0] = 0;
- -        iorder[1] = 1;
- -        iorder[2] = 2;
- -        iorder[3] = 3;
- -    }
- -    else if (bo == 1) {
- -        /* force BE */
- -        iorder[0] = 3;
- -        iorder[1] = 2;
- -        iorder[2] = 1;
- -        iorder[3] = 0;
+ +    if (q == e) {
+ +        if (consumed)
+ +            *consumed = size;
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
++        _Py_RETURN_UNICODE_EMPTY();
       }
   
- -    /* This might be one to much, because of a BOM */
- -    unicode = PyUnicode_New((size+3)/4, 127);
- -    if (!unicode)
- -        return NULL;
- -    if (size == 0)
- -        return unicode;
- -    outpos = 0;
+ +#ifdef WORDS_BIGENDIAN
+ +    le = bo < 0;
+ +#else
+ +    le = bo <= 0;
+ +#endif
   
- -    while (q < e) {
- -        Py_UCS4 ch;
- -        /* remaining bytes at the end? (size should be divisible by 4) */
- -        if (e-q<4) {
- -            if (consumed)
+ +    _PyUnicodeWriter_Init(&writer, 0);
+ +    if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1)
+ +        goto onError;
+ +
+ +    while (1) {
+ +        Py_UCS4 ch = 0;
+ +        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
+ +
+ +        if (e - q >= 4) {
+ +            enum PyUnicode_Kind kind = writer.kind;
+ +            void *data = writer.data;
+ +            const unsigned char *last = e - 4;
+ +            Py_ssize_t pos = writer.pos;
+ +            if (le) {
+ +                do {
+ +                    ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
+ +                    if (ch > maxch)
+ +                        break;
+ +                    PyUnicode_WRITE(kind, data, pos++, ch);
+ +                    q += 4;
+ +                } while (q <= last);
+ +            }
+ +            else {
+ +                do {
+ +                    ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
+ +                    if (ch > maxch)
+ +                        break;
+ +                    PyUnicode_WRITE(kind, data, pos++, ch);
+ +                    q += 4;
+ +                } while (q <= last);
+ +            }
+ +            writer.pos = pos;
+ +        }
+ +
+ +        if (ch <= maxch) {
+ +            if (q == e || consumed)
                   break;
+ +            /* remaining bytes at the end? (size should be divisible by 4) */
               errmsg = "truncated data";
- -            startinpos = ((const char *)q)-starts;
- -            endinpos = ((const char *)e)-starts;
- -            goto utf32Error;
- -            /* The remaining input chars are ignored if the callback
- -               chooses to skip the input */
+ +            startinpos = ((const char *)q) - starts;
+ +            endinpos = ((const char *)e) - starts;
           }
- -        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
- -            (q[iorder[1]] << 8) | q[iorder[0]];
- -
- -        if (ch >= 0x110000)
- -        {
+ +        else {
+ +            if (ch < 0x110000) {
+ +                if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
+ +                    goto onError;
+ +                PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
+ +                writer.pos++;
+ +                q += 4;
+ +                continue;
+ +            }
               errmsg = "codepoint not in range(0x110000)";
- -            startinpos = ((const char *)q)-starts;
- -            endinpos = startinpos+4;
- -            goto utf32Error;
+ +            startinpos = ((const char *)q) - starts;
+ +            endinpos = startinpos + 4;
           }
- -        if (unicode_putchar(&unicode, &outpos, ch) < 0)
- -            goto onError;
- -        q += 4;
- -        continue;
- -      utf32Error:
- -        if (unicode_decode_call_errorhandler(
+ +
+ +        /* The remaining input chars are ignored if the callback
+ +           chooses to skip the input */
+ +        if (unicode_decode_call_errorhandler_writer(
                   errors, &errorHandler,
                   "utf32", errmsg,
                   &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
@@@ -5108,11 -5237,10 +5111,10 @@@ PyUnicode_DecodeUTF16Stateful(const cha
       if (q == e) {
           if (consumed)
               *consumed = size;
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
+         _Py_RETURN_UNICODE_EMPTY();
       }
   
- -#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+ +#if PY_LITTLE_ENDIAN
       native_ordering = bo <= 0;
   #else
       native_ordering = bo >= 0;
@@@ -5384,12 -5516,9 +5386,10 @@@ PyUnicode_DecodeUnicodeEscape(const cha
       PyObject *errorHandler = NULL;
       PyObject *exc = NULL;
       Py_ssize_t len;
- -    Py_ssize_t i;
   
       len = length_of_escaped_ascii_string(s, size);
-     if (len == 0) {
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
-     }
++    if (len == 0)
++        _Py_RETURN_UNICODE_EMPTY();
   
       /* After length_of_escaped_ascii_string() there are two alternatives,
          either the string is pure ASCII with named escapes like \n, etc.
@@@ -5781,11 -5915,6 +5781,9 @@@ PyUnicode_DecodeRawUnicodeEscape(const 
       PyObject *errorHandler = NULL;
       PyObject *exc = NULL;
   
-     if (size == 0) {
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
-     }
++    if (size == 0)
++        _Py_RETURN_UNICODE_EMPTY();
+ +
       /* Escaped strings will always be longer than the resulting
          Unicode string, so we start with size here and then reduce the
          length after conversion to the true value. (But decoding error
@@@ -5988,15 -6113,13 +5986,13 @@@ _PyUnicode_DecodeUnicodeInternal(const 
                        1))
           return NULL;
   
-     if (size == 0) {
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
-     }
++    if (size == 0)
++        _Py_RETURN_UNICODE_EMPTY();
+ +
       /* XXX overflow detection missing */
- -    v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
- -    if (v == NULL)
+ +    _PyUnicodeWriter_Init(&writer, 0);
+ +    if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1)
           goto onError;
- -    if (PyUnicode_GET_LENGTH(v) == 0)
- -        return v;
- -    outpos = 0;
       end = s + size;
   
       while (s < end) {
@@@ -7298,14 -7422,12 +7291,12 @@@ PyUnicode_DecodeCharmap(const char *s
       if (mapping == NULL)
           return PyUnicode_DecodeLatin1(s, size, errors);
   
-     if (size == 0) {
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
-     }
- -    v = PyUnicode_New(size, 127);
- -    if (v == NULL)
- -        goto onError;
+     if (size == 0)
- -        return v;
- -    outpos = 0;
++        _Py_RETURN_UNICODE_EMPTY();
+ +    _PyUnicodeWriter_Init(&writer, 0);
+ +    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
+ +        goto onError;
+ +
       e = s + size;
       if (PyUnicode_CheckExact(mapping)) {
           Py_ssize_t maplen;
author	Serhiy Storchaka <storchaka@gmail.com>
	Sat, 26 Jan 2013 10:18:17 +0000 (12:18 +0200)
committer	Serhiy Storchaka <storchaka@gmail.com>
	Sat, 26 Jan 2013 10:18:17 +0000 (12:18 +0200)
		1	2
Misc/NEWS	patch \|	diff1 \|	diff2 \|	blob \| history
Objects/unicodeobject.c	patch \|	diff1 \|	diff2 \|	blob \| history