From: Serhiy Storchaka <storchaka@gmail.com>
Date: Tue, 8 Jan 2013 21:40:52 +0000 (+0200)
Subject: Issue #11461: Fix the incremental UTF-16 decoder. Original patch by
X-Git-Tag: v3.3.1rc1~410
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ae3b32ad6bd6326e14b4b4316af6edea8dfc9173;p=python

Issue #11461: Fix the incremental UTF-16 decoder. Original patch by
Amaury Forgeot d'Arc. Added tests for partial decoding of non-BMP
characters.
---

ae3b32ad6bd6326e14b4b4316af6edea8dfc9173
diff --cc Misc/NEWS
index b40d4c7f05,f51476aab4..8a5d14d443
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@@ -12,11 -10,9 +12,14 @@@ What's New in Python 3.3.1
  Core and Builtins
  -----------------
  
+ - Issue #11461: Fix the incremental UTF-16 decoder. Original patch by
+   Amaury Forgeot d'Arc.
+ 
 +- Issue #16881: Fix Py_ARRAY_LENGTH macro for GCC < 3.1.
 +
 +- Issue #16856: Fix a segmentation fault from calling repr() on a dict with
 +  a key whose repr raise an exception.
 +
  - Issue #16367: Fix FileIO.readall() on Windows for files larger than 2 GB.
  
  - Issue #16455: On FreeBSD and Solaris, if the locale is C, the
diff --cc Objects/unicodeobject.c
index 1522a16ba6,7f86bfd6df..b4fc0040b6
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@@ -5212,98 -3435,179 +5212,101 @@@ PyUnicode_DecodeUTF16Stateful(const cha
         byte order setting accordingly. In native mode, the leading BOM
         mark is skipped, in all other modes, it is copied to the output
         stream as-is (giving a ZWNBSP character). */
 -    if (bo == 0) {
 -        if (size >= 2) {
 -            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
 -#ifdef BYTEORDER_IS_LITTLE_ENDIAN
 -            if (bom == 0xFEFF) {
 -                q += 2;
 -                bo = -1;
 -            }
 -            else if (bom == 0xFFFE) {
 -                q += 2;
 -                bo = 1;
 -            }
 -#else
 -            if (bom == 0xFEFF) {
 -                q += 2;
 -                bo = 1;
 -            }
 -            else if (bom == 0xFFFE) {
 -                q += 2;
 -                bo = -1;
 -            }
 -#endif
 +    if (bo == 0 && size >= 2) {
 +        const Py_UCS4 bom = (q[1] << 8) | q[0];
 +        if (bom == 0xFEFF) {
 +            q += 2;
 +            bo = -1;
 +        }
 +        else if (bom == 0xFFFE) {
 +            q += 2;
 +            bo = 1;
          }
 +        if (byteorder)
 +            *byteorder = bo;
      }
  
 -    if (bo == -1) {
 -        /* force LE */
 -        ihi = 1;
 -        ilo = 0;
 -    }
 -    else if (bo == 1) {
 -        /* force BE */
 -        ihi = 0;
 -        ilo = 1;
 +    if (q == e) {
 +        if (consumed)
 +            *consumed = size;
 +        Py_INCREF(unicode_empty);
 +        return unicode_empty;
      }
 -#ifdef BYTEORDER_IS_LITTLE_ENDIAN
 -    native_ordering = ilo < ihi;
 -#else
 -    native_ordering = ilo > ihi;
 -#endif
 -
 -    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
 -    while (1) {
 -        Py_UNICODE ch;
 -        if (e - q < 2) {
 -            /* remaining byte at the end? (size should be even) */
 -            if (q == e || consumed)
 -                break;
 -            errmsg = "truncated data";
 -            startinpos = ((const char *)q) - starts;
 -            endinpos = ((const char *)e) - starts;
 -            outpos = p - PyUnicode_AS_UNICODE(unicode);
 -            goto utf16Error;
 -            /* The remaining input chars are ignored if the callback
 -               chooses to skip the input */
 -        }
 -        /* First check for possible aligned read of a C 'long'. Unaligned
 -           reads are more expensive, better to defer to another iteration. */
 -        if (!((size_t) q & LONG_PTR_MASK)) {
 -            /* Fast path for runs of non-surrogate chars. */
 -            register const unsigned char *_q = q;
 -            Py_UNICODE *_p = p;
 -            if (native_ordering) {
 -                /* Native ordering is simple: as long as the input cannot
 -                   possibly contain a surrogate char, do an unrolled copy
 -                   of several 16-bit code points to the target object.
 -                   The non-surrogate check is done on several input bytes
 -                   at a time (as many as a C 'long' can contain). */
 -                while (_q < aligned_end) {
 -                    unsigned long data = * (unsigned long *) _q;
 -                    if (data & FAST_CHAR_MASK)
 -                        break;
 -                    _p[0] = ((unsigned short *) _q)[0];
 -                    _p[1] = ((unsigned short *) _q)[1];
 -#if (SIZEOF_LONG == 8)
 -                    _p[2] = ((unsigned short *) _q)[2];
 -                    _p[3] = ((unsigned short *) _q)[3];
 -#endif
 -                    _q += SIZEOF_LONG;
 -                    _p += SIZEOF_LONG / 2;
 -                }
 -            }
 -            else {
 -                /* Byteswapped ordering is similar, but we must decompose
 -                   the copy bytewise, and take care of zero'ing out the
 -                   upper bytes if the target object is in 32-bit units
 -                   (that is, in UCS-4 builds). */
 -                while (_q < aligned_end) {
 -                    unsigned long data = * (unsigned long *) _q;
 -                    if (data & SWAPPED_FAST_CHAR_MASK)
 -                        break;
 -                    /* Zero upper bytes in UCS-4 builds */
 -#if (Py_UNICODE_SIZE > 2)
 -                    _p[0] = 0;
 -                    _p[1] = 0;
 -#if (SIZEOF_LONG == 8)
 -                    _p[2] = 0;
 -                    _p[3] = 0;
 -#endif
 -#endif
 -                    /* Issue #4916; UCS-4 builds on big endian machines must
 -                       fill the two last bytes of each 4-byte unit. */
 -#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
 -# define OFF 2
 -#else
 -# define OFF 0
 -#endif
 -                    ((unsigned char *) _p)[OFF + 1] = _q[0];
 -                    ((unsigned char *) _p)[OFF + 0] = _q[1];
 -                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
 -                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
 -#if (SIZEOF_LONG == 8)
 -                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
 -                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
 -                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
 -                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
 -#endif
 -#undef OFF
 -                    _q += SIZEOF_LONG;
 -                    _p += SIZEOF_LONG / 2;
 -                }
 -            }
 -            p = _p;
 -            q = _q;
 -            if (e - q < 2)
 -                continue;
 -        }
 -        ch = (q[ihi] << 8) | q[ilo];
  
 -        q += 2;
 +#ifdef BYTEORDER_IS_LITTLE_ENDIAN
 +    native_ordering = bo <= 0;
 +#else
 +    native_ordering = bo >= 0;
 +#endif
  
 -        if (ch < 0xD800 || ch > 0xDFFF) {
 -            *p++ = ch;
 -            continue;
 +    /* Note: size will always be longer than the resulting Unicode
 +       character count */
 +    unicode = PyUnicode_New((e - q + 1) / 2, 127);
 +    if (!unicode)
 +        return NULL;
 +
 +    outpos = 0;
 +    while (1) {
 +        Py_UCS4 ch = 0;
 +        if (e - q >= 2) {
 +            int kind = PyUnicode_KIND(unicode);
 +            if (kind == PyUnicode_1BYTE_KIND) {
 +                if (PyUnicode_IS_ASCII(unicode))
 +                    ch = asciilib_utf16_decode(&q, e,
 +                            PyUnicode_1BYTE_DATA(unicode), &outpos,
 +                            native_ordering);
 +                else
 +                    ch = ucs1lib_utf16_decode(&q, e,
 +                            PyUnicode_1BYTE_DATA(unicode), &outpos,
 +                            native_ordering);
 +            } else if (kind == PyUnicode_2BYTE_KIND) {
 +                ch = ucs2lib_utf16_decode(&q, e,
 +                        PyUnicode_2BYTE_DATA(unicode), &outpos,
 +                        native_ordering);
 +            } else {
 +                assert(kind == PyUnicode_4BYTE_KIND);
 +                ch = ucs4lib_utf16_decode(&q, e,
 +                        PyUnicode_4BYTE_DATA(unicode), &outpos,
 +                        native_ordering);
 +            }
          }
  
 -        /* UTF-16 code pair: */
 -        if (e - q < 2) {
 +        switch (ch)
 +        {
 +        case 0:
 +            /* remaining byte at the end? (size should be even) */
 +            if (q == e || consumed)
 +                goto End;
 +            errmsg = "truncated data";
 +            startinpos = ((const char *)q) - starts;
 +            endinpos = ((const char *)e) - starts;
 +            break;
 +            /* The remaining input chars are ignored if the callback
 +               chooses to skip the input */
 +        case 1:
+             q -= 2;
+             if (consumed)
 -                break;
++                goto End;
              errmsg = "unexpected end of data";
-             startinpos = ((const char *)q) - 2 - starts;
+             startinpos = ((const char *)q) - starts;
              endinpos = ((const char *)e) - starts;
 -            goto utf16Error;
 -        }
 -        if (0xD800 <= ch && ch <= 0xDBFF) {
 -            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
 -            q += 2;
 -            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
 -#ifndef Py_UNICODE_WIDE
 -                *p++ = ch;
 -                *p++ = ch2;
 -#else
 -                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
 -#endif
 -                continue;
 -            }
 -            else {
 -                errmsg = "illegal UTF-16 surrogate";
 -                startinpos = (((const char *)q)-4)-starts;
 -                endinpos = startinpos+2;
 -                goto utf16Error;
 -            }
 -
 +            break;
 +        case 2:
 +            errmsg = "illegal encoding";
 +            startinpos = ((const char *)q) - 2 - starts;
 +            endinpos = startinpos + 2;
 +            break;
 +        case 3:
 +            errmsg = "illegal UTF-16 surrogate";
 +            startinpos = ((const char *)q) - 4 - starts;
 +            endinpos = startinpos + 2;
 +            break;
 +        default:
 +            if (unicode_putchar(&unicode, &outpos, ch) < 0)
 +                goto onError;
 +            continue;
          }
 -        errmsg = "illegal encoding";
 -        startinpos = (((const char *)q)-2)-starts;
 -        endinpos = startinpos+2;
 -        /* Fall through to report the error */
  
 -      utf16Error:
 -        outpos = p - PyUnicode_AS_UNICODE(unicode);
          if (unicode_decode_call_errorhandler(
                  errors,
                  &errorHandler,