byte order setting accordingly. In native mode, the leading BOM
mark is skipped, in all other modes, it is copied to the output
stream as-is (giving a ZWNBSP character). */
- if (bo == 0) {
- if (size >= 2) {
- const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
- if (bom == 0xFEFF) {
- q += 2;
- bo = -1;
- }
- else if (bom == 0xFFFE) {
- q += 2;
- bo = 1;
- }
-#else
- if (bom == 0xFEFF) {
- q += 2;
- bo = 1;
- }
- else if (bom == 0xFFFE) {
- q += 2;
- bo = -1;
- }
-#endif
+ if (bo == 0 && size >= 2) {
+ const Py_UCS4 bom = (q[1] << 8) | q[0];
+ if (bom == 0xFEFF) {
+ q += 2;
+ bo = -1;
+ }
+ else if (bom == 0xFFFE) {
+ q += 2;
+ bo = 1;
}
+ if (byteorder)
+ *byteorder = bo;
}
- if (bo == -1) {
- /* force LE */
- ihi = 1;
- ilo = 0;
- }
- else if (bo == 1) {
- /* force BE */
- ihi = 0;
- ilo = 1;
+ if (q == e) {
+ if (consumed)
+ *consumed = size;
+ Py_INCREF(unicode_empty);
+ return unicode_empty;
}
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
- native_ordering = ilo < ihi;
-#else
- native_ordering = ilo > ihi;
-#endif
-
- aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
- while (1) {
- Py_UNICODE ch;
- if (e - q < 2) {
- /* remaining byte at the end? (size should be even) */
- if (q == e || consumed)
- break;
- errmsg = "truncated data";
- startinpos = ((const char *)q) - starts;
- endinpos = ((const char *)e) - starts;
- outpos = p - PyUnicode_AS_UNICODE(unicode);
- goto utf16Error;
- /* The remaining input chars are ignored if the callback
- chooses to skip the input */
- }
- /* First check for possible aligned read of a C 'long'. Unaligned
- reads are more expensive, better to defer to another iteration. */
- if (!((size_t) q & LONG_PTR_MASK)) {
- /* Fast path for runs of non-surrogate chars. */
- register const unsigned char *_q = q;
- Py_UNICODE *_p = p;
- if (native_ordering) {
- /* Native ordering is simple: as long as the input cannot
- possibly contain a surrogate char, do an unrolled copy
- of several 16-bit code points to the target object.
- The non-surrogate check is done on several input bytes
- at a time (as many as a C 'long' can contain). */
- while (_q < aligned_end) {
- unsigned long data = * (unsigned long *) _q;
- if (data & FAST_CHAR_MASK)
- break;
- _p[0] = ((unsigned short *) _q)[0];
- _p[1] = ((unsigned short *) _q)[1];
-#if (SIZEOF_LONG == 8)
- _p[2] = ((unsigned short *) _q)[2];
- _p[3] = ((unsigned short *) _q)[3];
-#endif
- _q += SIZEOF_LONG;
- _p += SIZEOF_LONG / 2;
- }
- }
- else {
- /* Byteswapped ordering is similar, but we must decompose
- the copy bytewise, and take care of zero'ing out the
- upper bytes if the target object is in 32-bit units
- (that is, in UCS-4 builds). */
- while (_q < aligned_end) {
- unsigned long data = * (unsigned long *) _q;
- if (data & SWAPPED_FAST_CHAR_MASK)
- break;
- /* Zero upper bytes in UCS-4 builds */
-#if (Py_UNICODE_SIZE > 2)
- _p[0] = 0;
- _p[1] = 0;
-#if (SIZEOF_LONG == 8)
- _p[2] = 0;
- _p[3] = 0;
-#endif
-#endif
- /* Issue #4916; UCS-4 builds on big endian machines must
- fill the two last bytes of each 4-byte unit. */
-#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
-# define OFF 2
-#else
-# define OFF 0
-#endif
- ((unsigned char *) _p)[OFF + 1] = _q[0];
- ((unsigned char *) _p)[OFF + 0] = _q[1];
- ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
- ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
-#if (SIZEOF_LONG == 8)
- ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
- ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
- ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
- ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
-#endif
-#undef OFF
- _q += SIZEOF_LONG;
- _p += SIZEOF_LONG / 2;
- }
- }
- p = _p;
- q = _q;
- if (e - q < 2)
- continue;
- }
- ch = (q[ihi] << 8) | q[ilo];
- q += 2;
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+ native_ordering = bo <= 0;
+#else
+ native_ordering = bo >= 0;
+#endif
- if (ch < 0xD800 || ch > 0xDFFF) {
- *p++ = ch;
- continue;
+ /* Note: size will always be longer than the resulting Unicode
+ character count */
+ unicode = PyUnicode_New((e - q + 1) / 2, 127);
+ if (!unicode)
+ return NULL;
+
+ outpos = 0;
+ while (1) {
+ Py_UCS4 ch = 0;
+ if (e - q >= 2) {
+ int kind = PyUnicode_KIND(unicode);
+ if (kind == PyUnicode_1BYTE_KIND) {
+ if (PyUnicode_IS_ASCII(unicode))
+ ch = asciilib_utf16_decode(&q, e,
+ PyUnicode_1BYTE_DATA(unicode), &outpos,
+ native_ordering);
+ else
+ ch = ucs1lib_utf16_decode(&q, e,
+ PyUnicode_1BYTE_DATA(unicode), &outpos,
+ native_ordering);
+ } else if (kind == PyUnicode_2BYTE_KIND) {
+ ch = ucs2lib_utf16_decode(&q, e,
+ PyUnicode_2BYTE_DATA(unicode), &outpos,
+ native_ordering);
+ } else {
+ assert(kind == PyUnicode_4BYTE_KIND);
+ ch = ucs4lib_utf16_decode(&q, e,
+ PyUnicode_4BYTE_DATA(unicode), &outpos,
+ native_ordering);
+ }
}
- /* UTF-16 code pair: */
- if (e - q < 2) {
+ switch (ch)
+ {
+ case 0:
+ /* remaining byte at the end? (size should be even) */
+ if (q == e || consumed)
+ goto End;
+ errmsg = "truncated data";
+ startinpos = ((const char *)q) - starts;
+ endinpos = ((const char *)e) - starts;
+ break;
+ /* The remaining input chars are ignored if the callback
+ chooses to skip the input */
+ case 1:
+ q -= 2;
+ if (consumed)
- break;
+ goto End;
errmsg = "unexpected end of data";
- startinpos = ((const char *)q) - 2 - starts;
+ startinpos = ((const char *)q) - starts;
endinpos = ((const char *)e) - starts;
- goto utf16Error;
- }
- if (0xD800 <= ch && ch <= 0xDBFF) {
- Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
- q += 2;
- if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
-#ifndef Py_UNICODE_WIDE
- *p++ = ch;
- *p++ = ch2;
-#else
- *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
-#endif
- continue;
- }
- else {
- errmsg = "illegal UTF-16 surrogate";
- startinpos = (((const char *)q)-4)-starts;
- endinpos = startinpos+2;
- goto utf16Error;
- }
-
+ break;
+ case 2:
+ errmsg = "illegal encoding";
+ startinpos = ((const char *)q) - 2 - starts;
+ endinpos = startinpos + 2;
+ break;
+ case 3:
+ errmsg = "illegal UTF-16 surrogate";
+ startinpos = ((const char *)q) - 4 - starts;
+ endinpos = startinpos + 2;
+ break;
+ default:
+ if (unicode_putchar(&unicode, &outpos, ch) < 0)
+ goto onError;
+ continue;
}
- errmsg = "illegal encoding";
- startinpos = (((const char *)q)-2)-starts;
- endinpos = startinpos+2;
- /* Fall through to report the error */
- utf16Error:
- outpos = p - PyUnicode_AS_UNICODE(unicode);
if (unicode_decode_call_errorhandler(
errors,
&errorHandler,