Issue #11461: Fix the incremental UTF-16 decoder. Original patch by Amaury Forgeot d'Arc.

author Serhiy Storchaka <storchaka@gmail.com>

Tue, 8 Jan 2013 21:40:52 +0000 (23:40 +0200)

committer Serhiy Storchaka <storchaka@gmail.com>

Tue, 8 Jan 2013 21:40:52 +0000 (23:40 +0200)
author Serhiy Storchaka <storchaka@gmail.com>
Tue, 8 Jan 2013 21:40:52 +0000 (23:40 +0200)
committer Serhiy Storchaka <storchaka@gmail.com>
Tue, 8 Jan 2013 21:40:52 +0000 (23:40 +0200)
diff --cc Lib/test/test_codecs.py
Simple merge
diff --cc Misc/NEWS

index b40d4c7f0513fd634f3f0a421ccbc9e8a6dc9688,f51476aab4b76ef90af931506d59dcb29e01c0d5..8a5d14d443d3baa653635dfc4da0f14c02e88e00
--- 1/Misc/NEWS
--- 2/Misc/NEWS
+++ b/Misc/NEWS
@@@ -12,11 -10,9 +12,14 @@@ What's New in Python 3.3.1
   Core and Builtins
   -----------------
   
+ - Issue #11461: Fix the incremental UTF-16 decoder. Original patch by
+   Amaury Forgeot d'Arc.
+ 
+ +- Issue #16881: Fix Py_ARRAY_LENGTH macro for GCC < 3.1.
+ +
+ +- Issue #16856: Fix a segmentation fault from calling repr() on a dict with
+ +  a key whose repr raise an exception.
+ +
   - Issue #16367: Fix FileIO.readall() on Windows for files larger than 2 GB.
   
   - Issue #16455: On FreeBSD and Solaris, if the locale is C, the
diff --cc Objects/unicodeobject.c

index 1522a16ba6e34a81c2c45325623b3e3971694038,7f86bfd6df96a4c93a03c4619aae8cd22a4b5104..b4fc0040b6e5923b5a2647d938f749296acebe5d
--- 1/Objects/unicodeobject.c
--- 2/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@@ -5212,98 -3435,179 +5212,101 @@@ PyUnicode_DecodeUTF16Stateful(const cha
          byte order setting accordingly. In native mode, the leading BOM
          mark is skipped, in all other modes, it is copied to the output
          stream as-is (giving a ZWNBSP character). */
- -    if (bo == 0) {
- -        if (size >= 2) {
- -            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
- -#ifdef BYTEORDER_IS_LITTLE_ENDIAN
- -            if (bom == 0xFEFF) {
- -                q += 2;
- -                bo = -1;
- -            }
- -            else if (bom == 0xFFFE) {
- -                q += 2;
- -                bo = 1;
- -            }
- -#else
- -            if (bom == 0xFEFF) {
- -                q += 2;
- -                bo = 1;
- -            }
- -            else if (bom == 0xFFFE) {
- -                q += 2;
- -                bo = -1;
- -            }
- -#endif
+ +    if (bo == 0 && size >= 2) {
+ +        const Py_UCS4 bom = (q[1] << 8) | q[0];
+ +        if (bom == 0xFEFF) {
+ +            q += 2;
+ +            bo = -1;
+ +        }
+ +        else if (bom == 0xFFFE) {
+ +            q += 2;
+ +            bo = 1;
           }
+ +        if (byteorder)
+ +            *byteorder = bo;
       }
   
- -    if (bo == -1) {
- -        /* force LE */
- -        ihi = 1;
- -        ilo = 0;
- -    }
- -    else if (bo == 1) {
- -        /* force BE */
- -        ihi = 0;
- -        ilo = 1;
+ +    if (q == e) {
+ +        if (consumed)
+ +            *consumed = size;
+ +        Py_INCREF(unicode_empty);
+ +        return unicode_empty;
       }
- -#ifdef BYTEORDER_IS_LITTLE_ENDIAN
- -    native_ordering = ilo < ihi;
- -#else
- -    native_ordering = ilo > ihi;
- -#endif
- -
- -    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
- -    while (1) {
- -        Py_UNICODE ch;
- -        if (e - q < 2) {
- -            /* remaining byte at the end? (size should be even) */
- -            if (q == e || consumed)
- -                break;
- -            errmsg = "truncated data";
- -            startinpos = ((const char *)q) - starts;
- -            endinpos = ((const char *)e) - starts;
- -            outpos = p - PyUnicode_AS_UNICODE(unicode);
- -            goto utf16Error;
- -            /* The remaining input chars are ignored if the callback
- -               chooses to skip the input */
- -        }
- -        /* First check for possible aligned read of a C 'long'. Unaligned
- -           reads are more expensive, better to defer to another iteration. */
- -        if (!((size_t) q & LONG_PTR_MASK)) {
- -            /* Fast path for runs of non-surrogate chars. */
- -            register const unsigned char *_q = q;
- -            Py_UNICODE *_p = p;
- -            if (native_ordering) {
- -                /* Native ordering is simple: as long as the input cannot
- -                   possibly contain a surrogate char, do an unrolled copy
- -                   of several 16-bit code points to the target object.
- -                   The non-surrogate check is done on several input bytes
- -                   at a time (as many as a C 'long' can contain). */
- -                while (_q < aligned_end) {
- -                    unsigned long data = * (unsigned long *) _q;
- -                    if (data & FAST_CHAR_MASK)
- -                        break;
- -                    _p[0] = ((unsigned short *) _q)[0];
- -                    _p[1] = ((unsigned short *) _q)[1];
- -#if (SIZEOF_LONG == 8)
- -                    _p[2] = ((unsigned short *) _q)[2];
- -                    _p[3] = ((unsigned short *) _q)[3];
- -#endif
- -                    _q += SIZEOF_LONG;
- -                    _p += SIZEOF_LONG / 2;
- -                }
- -            }
- -            else {
- -                /* Byteswapped ordering is similar, but we must decompose
- -                   the copy bytewise, and take care of zero'ing out the
- -                   upper bytes if the target object is in 32-bit units
- -                   (that is, in UCS-4 builds). */
- -                while (_q < aligned_end) {
- -                    unsigned long data = * (unsigned long *) _q;
- -                    if (data & SWAPPED_FAST_CHAR_MASK)
- -                        break;
- -                    /* Zero upper bytes in UCS-4 builds */
- -#if (Py_UNICODE_SIZE > 2)
- -                    _p[0] = 0;
- -                    _p[1] = 0;
- -#if (SIZEOF_LONG == 8)
- -                    _p[2] = 0;
- -                    _p[3] = 0;
- -#endif
- -#endif
- -                    /* Issue #4916; UCS-4 builds on big endian machines must
- -                       fill the two last bytes of each 4-byte unit. */
- -#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
- -# define OFF 2
- -#else
- -# define OFF 0
- -#endif
- -                    ((unsigned char *) _p)[OFF + 1] = _q[0];
- -                    ((unsigned char *) _p)[OFF + 0] = _q[1];
- -                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
- -                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
- -#if (SIZEOF_LONG == 8)
- -                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
- -                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
- -                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
- -                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
- -#endif
- -#undef OFF
- -                    _q += SIZEOF_LONG;
- -                    _p += SIZEOF_LONG / 2;
- -                }
- -            }
- -            p = _p;
- -            q = _q;
- -            if (e - q < 2)
- -                continue;
- -        }
- -        ch = (q[ihi] << 8) | q[ilo];
   
- -        q += 2;
+ +#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+ +    native_ordering = bo <= 0;
+ +#else
+ +    native_ordering = bo >= 0;
+ +#endif
   
- -        if (ch < 0xD800 || ch > 0xDFFF) {
- -            *p++ = ch;
- -            continue;
+ +    /* Note: size will always be longer than the resulting Unicode
+ +       character count */
+ +    unicode = PyUnicode_New((e - q + 1) / 2, 127);
+ +    if (!unicode)
+ +        return NULL;
+ +
+ +    outpos = 0;
+ +    while (1) {
+ +        Py_UCS4 ch = 0;
+ +        if (e - q >= 2) {
+ +            int kind = PyUnicode_KIND(unicode);
+ +            if (kind == PyUnicode_1BYTE_KIND) {
+ +                if (PyUnicode_IS_ASCII(unicode))
+ +                    ch = asciilib_utf16_decode(&q, e,
+ +                            PyUnicode_1BYTE_DATA(unicode), &outpos,
+ +                            native_ordering);
+ +                else
+ +                    ch = ucs1lib_utf16_decode(&q, e,
+ +                            PyUnicode_1BYTE_DATA(unicode), &outpos,
+ +                            native_ordering);
+ +            } else if (kind == PyUnicode_2BYTE_KIND) {
+ +                ch = ucs2lib_utf16_decode(&q, e,
+ +                        PyUnicode_2BYTE_DATA(unicode), &outpos,
+ +                        native_ordering);
+ +            } else {
+ +                assert(kind == PyUnicode_4BYTE_KIND);
+ +                ch = ucs4lib_utf16_decode(&q, e,
+ +                        PyUnicode_4BYTE_DATA(unicode), &outpos,
+ +                        native_ordering);
+ +            }
           }
   
- -        /* UTF-16 code pair: */
- -        if (e - q < 2) {
+ +        switch (ch)
+ +        {
+ +        case 0:
+ +            /* remaining byte at the end? (size should be even) */
+ +            if (q == e || consumed)
+ +                goto End;
+ +            errmsg = "truncated data";
+ +            startinpos = ((const char *)q) - starts;
+ +            endinpos = ((const char *)e) - starts;
+ +            break;
+ +            /* The remaining input chars are ignored if the callback
+ +               chooses to skip the input */
+ +        case 1:
+             q -= 2;
+             if (consumed)
- -                break;
++                goto End;
               errmsg = "unexpected end of data";
-             startinpos = ((const char *)q) - 2 - starts;
+             startinpos = ((const char *)q) - starts;
               endinpos = ((const char *)e) - starts;
- -            goto utf16Error;
- -        }
- -        if (0xD800 <= ch && ch <= 0xDBFF) {
- -            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
- -            q += 2;
- -            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
- -#ifndef Py_UNICODE_WIDE
- -                *p++ = ch;
- -                *p++ = ch2;
- -#else
- -                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
- -#endif
- -                continue;
- -            }
- -            else {
- -                errmsg = "illegal UTF-16 surrogate";
- -                startinpos = (((const char *)q)-4)-starts;
- -                endinpos = startinpos+2;
- -                goto utf16Error;
- -            }
- -
+ +            break;
+ +        case 2:
+ +            errmsg = "illegal encoding";
+ +            startinpos = ((const char *)q) - 2 - starts;
+ +            endinpos = startinpos + 2;
+ +            break;
+ +        case 3:
+ +            errmsg = "illegal UTF-16 surrogate";
+ +            startinpos = ((const char *)q) - 4 - starts;
+ +            endinpos = startinpos + 2;
+ +            break;
+ +        default:
+ +            if (unicode_putchar(&unicode, &outpos, ch) < 0)
+ +                goto onError;
+ +            continue;
           }
- -        errmsg = "illegal encoding";
- -        startinpos = (((const char *)q)-2)-starts;
- -        endinpos = startinpos+2;
- -        /* Fall through to report the error */
   
- -      utf16Error:
- -        outpos = p - PyUnicode_AS_UNICODE(unicode);
           if (unicode_decode_call_errorhandler(
                   errors,
                   &errorHandler,
author	Serhiy Storchaka <storchaka@gmail.com>
	Tue, 8 Jan 2013 21:40:52 +0000 (23:40 +0200)
committer	Serhiy Storchaka <storchaka@gmail.com>
	Tue, 8 Jan 2013 21:40:52 +0000 (23:40 +0200)
		1	2
Lib/test/test_codecs.py	patch \|	diff1 \|	diff2 \|	blob \| history
Misc/NEWS	patch \|	diff1 \|	diff2 \|	blob \| history
Objects/unicodeobject.c	patch \|	diff1 \|	diff2 \|	blob \| history