From: Serhiy Storchaka Date: Tue, 8 Jan 2013 21:40:52 +0000 (+0200) Subject: Issue #11461: Fix the incremental UTF-16 decoder. Original patch by X-Git-Tag: v3.3.1rc1~410 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ae3b32ad6bd6326e14b4b4316af6edea8dfc9173;p=python Issue #11461: Fix the incremental UTF-16 decoder. Original patch by Amaury Forgeot d'Arc. Added tests for partial decoding of non-BMP characters. --- ae3b32ad6bd6326e14b4b4316af6edea8dfc9173 diff --cc Misc/NEWS index b40d4c7f05,f51476aab4..8a5d14d443 --- a/Misc/NEWS +++ b/Misc/NEWS @@@ -12,11 -10,9 +12,14 @@@ What's New in Python 3.3.1 Core and Builtins ----------------- + - Issue #11461: Fix the incremental UTF-16 decoder. Original patch by + Amaury Forgeot d'Arc. + +- Issue #16881: Fix Py_ARRAY_LENGTH macro for GCC < 3.1. + +- Issue #16856: Fix a segmentation fault from calling repr() on a dict with + a key whose repr raise an exception. + - Issue #16367: Fix FileIO.readall() on Windows for files larger than 2 GB. - Issue #16455: On FreeBSD and Solaris, if the locale is C, the diff --cc Objects/unicodeobject.c index 1522a16ba6,7f86bfd6df..b4fc0040b6 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@@ -5212,98 -3435,179 +5212,101 @@@ PyUnicode_DecodeUTF16Stateful(const cha byte order setting accordingly. In native mode, the leading BOM mark is skipped, in all other modes, it is copied to the output stream as-is (giving a ZWNBSP character). */ - if (bo == 0) { - if (size >= 2) { - const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; -#ifdef BYTEORDER_IS_LITTLE_ENDIAN - if (bom == 0xFEFF) { - q += 2; - bo = -1; - } - else if (bom == 0xFFFE) { - q += 2; - bo = 1; - } -#else - if (bom == 0xFEFF) { - q += 2; - bo = 1; - } - else if (bom == 0xFFFE) { - q += 2; - bo = -1; - } -#endif + if (bo == 0 && size >= 2) { + const Py_UCS4 bom = (q[1] << 8) | q[0]; + if (bom == 0xFEFF) { + q += 2; + bo = -1; + } + else if (bom == 0xFFFE) { + q += 2; + bo = 1; } + if (byteorder) + *byteorder = bo; } - if (bo == -1) { - /* force LE */ - ihi = 1; - ilo = 0; - } - else if (bo == 1) { - /* force BE */ - ihi = 0; - ilo = 1; + if (q == e) { + if (consumed) + *consumed = size; + Py_INCREF(unicode_empty); + return unicode_empty; } -#ifdef BYTEORDER_IS_LITTLE_ENDIAN - native_ordering = ilo < ihi; -#else - native_ordering = ilo > ihi; -#endif - - aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); - while (1) { - Py_UNICODE ch; - if (e - q < 2) { - /* remaining byte at the end? (size should be even) */ - if (q == e || consumed) - break; - errmsg = "truncated data"; - startinpos = ((const char *)q) - starts; - endinpos = ((const char *)e) - starts; - outpos = p - PyUnicode_AS_UNICODE(unicode); - goto utf16Error; - /* The remaining input chars are ignored if the callback - chooses to skip the input */ - } - /* First check for possible aligned read of a C 'long'. Unaligned - reads are more expensive, better to defer to another iteration. */ - if (!((size_t) q & LONG_PTR_MASK)) { - /* Fast path for runs of non-surrogate chars. */ - register const unsigned char *_q = q; - Py_UNICODE *_p = p; - if (native_ordering) { - /* Native ordering is simple: as long as the input cannot - possibly contain a surrogate char, do an unrolled copy - of several 16-bit code points to the target object. - The non-surrogate check is done on several input bytes - at a time (as many as a C 'long' can contain). */ - while (_q < aligned_end) { - unsigned long data = * (unsigned long *) _q; - if (data & FAST_CHAR_MASK) - break; - _p[0] = ((unsigned short *) _q)[0]; - _p[1] = ((unsigned short *) _q)[1]; -#if (SIZEOF_LONG == 8) - _p[2] = ((unsigned short *) _q)[2]; - _p[3] = ((unsigned short *) _q)[3]; -#endif - _q += SIZEOF_LONG; - _p += SIZEOF_LONG / 2; - } - } - else { - /* Byteswapped ordering is similar, but we must decompose - the copy bytewise, and take care of zero'ing out the - upper bytes if the target object is in 32-bit units - (that is, in UCS-4 builds). */ - while (_q < aligned_end) { - unsigned long data = * (unsigned long *) _q; - if (data & SWAPPED_FAST_CHAR_MASK) - break; - /* Zero upper bytes in UCS-4 builds */ -#if (Py_UNICODE_SIZE > 2) - _p[0] = 0; - _p[1] = 0; -#if (SIZEOF_LONG == 8) - _p[2] = 0; - _p[3] = 0; -#endif -#endif - /* Issue #4916; UCS-4 builds on big endian machines must - fill the two last bytes of each 4-byte unit. */ -#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2) -# define OFF 2 -#else -# define OFF 0 -#endif - ((unsigned char *) _p)[OFF + 1] = _q[0]; - ((unsigned char *) _p)[OFF + 0] = _q[1]; - ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2]; - ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3]; -#if (SIZEOF_LONG == 8) - ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4]; - ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5]; - ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6]; - ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7]; -#endif -#undef OFF - _q += SIZEOF_LONG; - _p += SIZEOF_LONG / 2; - } - } - p = _p; - q = _q; - if (e - q < 2) - continue; - } - ch = (q[ihi] << 8) | q[ilo]; - q += 2; +#ifdef BYTEORDER_IS_LITTLE_ENDIAN + native_ordering = bo <= 0; +#else + native_ordering = bo >= 0; +#endif - if (ch < 0xD800 || ch > 0xDFFF) { - *p++ = ch; - continue; + /* Note: size will always be longer than the resulting Unicode + character count */ + unicode = PyUnicode_New((e - q + 1) / 2, 127); + if (!unicode) + return NULL; + + outpos = 0; + while (1) { + Py_UCS4 ch = 0; + if (e - q >= 2) { + int kind = PyUnicode_KIND(unicode); + if (kind == PyUnicode_1BYTE_KIND) { + if (PyUnicode_IS_ASCII(unicode)) + ch = asciilib_utf16_decode(&q, e, + PyUnicode_1BYTE_DATA(unicode), &outpos, + native_ordering); + else + ch = ucs1lib_utf16_decode(&q, e, + PyUnicode_1BYTE_DATA(unicode), &outpos, + native_ordering); + } else if (kind == PyUnicode_2BYTE_KIND) { + ch = ucs2lib_utf16_decode(&q, e, + PyUnicode_2BYTE_DATA(unicode), &outpos, + native_ordering); + } else { + assert(kind == PyUnicode_4BYTE_KIND); + ch = ucs4lib_utf16_decode(&q, e, + PyUnicode_4BYTE_DATA(unicode), &outpos, + native_ordering); + } } - /* UTF-16 code pair: */ - if (e - q < 2) { + switch (ch) + { + case 0: + /* remaining byte at the end? (size should be even) */ + if (q == e || consumed) + goto End; + errmsg = "truncated data"; + startinpos = ((const char *)q) - starts; + endinpos = ((const char *)e) - starts; + break; + /* The remaining input chars are ignored if the callback + chooses to skip the input */ + case 1: + q -= 2; + if (consumed) - break; ++ goto End; errmsg = "unexpected end of data"; - startinpos = ((const char *)q) - 2 - starts; + startinpos = ((const char *)q) - starts; endinpos = ((const char *)e) - starts; - goto utf16Error; - } - if (0xD800 <= ch && ch <= 0xDBFF) { - Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; - q += 2; - if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { -#ifndef Py_UNICODE_WIDE - *p++ = ch; - *p++ = ch2; -#else - *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; -#endif - continue; - } - else { - errmsg = "illegal UTF-16 surrogate"; - startinpos = (((const char *)q)-4)-starts; - endinpos = startinpos+2; - goto utf16Error; - } - + break; + case 2: + errmsg = "illegal encoding"; + startinpos = ((const char *)q) - 2 - starts; + endinpos = startinpos + 2; + break; + case 3: + errmsg = "illegal UTF-16 surrogate"; + startinpos = ((const char *)q) - 4 - starts; + endinpos = startinpos + 2; + break; + default: + if (unicode_putchar(&unicode, &outpos, ch) < 0) + goto onError; + continue; } - errmsg = "illegal encoding"; - startinpos = (((const char *)q)-2)-starts; - endinpos = startinpos+2; - /* Fall through to report the error */ - utf16Error: - outpos = p - PyUnicode_AS_UNICODE(unicode); if (unicode_decode_call_errorhandler( errors, &errorHandler,