#include <windows.h>
#endif
-/* Endianness switches; defaults to little endian */
-
-#ifdef WORDS_BIGENDIAN
-# define BYTEORDER_IS_BIG_ENDIAN
-#else
-# define BYTEORDER_IS_LITTLE_ENDIAN
-#endif
-
/* --- Globals ------------------------------------------------------------
- The globals are initialized by the _PyUnicode_Init() API and should
- not be used before calling that API.
+ NOTE: In the interpreter's initialization phase, some globals are currently
+ initialized dynamically as needed. In the process Unicode objects may
+ be created before the Unicode type is ready.
*/
#ifndef Py_DEBUG
Py_ssize_t len;
- assert(Py_REFCNT(unicode) == 1);
-
len = _PyUnicode_WSTR_LENGTH(unicode);
if (len == 0) {
- Py_INCREF(unicode_empty);
Py_DECREF(unicode);
- return unicode_empty;
+ _Py_RETURN_UNICODE_EMPTY();
}
if (len == 1) {
if (size == 0) {
if (consumed)
*consumed = 0;
- Py_INCREF(unicode_empty);
- return unicode_empty;
- return unicode;
++ _Py_RETURN_UNICODE_EMPTY();
}
- shiftOutStart = outpos = 0;
+ /* Start off assuming it's all ASCII. Widen later as necessary. */
+ _PyUnicodeWriter_Init(&writer, 0);
+ if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
+ goto onError;
+
+ shiftOutStart = 0;
e = s + size;
while (s < e) {
byte order setting accordingly. In native mode, the leading BOM
mark is skipped, in all other modes, it is copied to the output
stream as-is (giving a ZWNBSP character). */
- if (bo == 0) {
- if (size >= 4) {
- const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
- (q[iorder[1]] << 8) | q[iorder[0]];
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
- if (bom == 0x0000FEFF) {
- q += 4;
- bo = -1;
- }
- else if (bom == 0xFFFE0000) {
- q += 4;
- bo = 1;
- }
-#else
- if (bom == 0x0000FEFF) {
- q += 4;
- bo = 1;
- }
- else if (bom == 0xFFFE0000) {
- q += 4;
- bo = -1;
- }
-#endif
+ if (bo == 0 && size >= 4) {
+ Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
+ if (bom == 0x0000FEFF) {
+ bo = -1;
+ q += 4;
+ }
+ else if (bom == 0xFFFE0000) {
+ bo = 1;
+ q += 4;
}
+ if (byteorder)
+ *byteorder = bo;
}
- if (bo == -1) {
- /* force LE */
- iorder[0] = 0;
- iorder[1] = 1;
- iorder[2] = 2;
- iorder[3] = 3;
- }
- else if (bo == 1) {
- /* force BE */
- iorder[0] = 3;
- iorder[1] = 2;
- iorder[2] = 1;
- iorder[3] = 0;
+ if (q == e) {
+ if (consumed)
+ *consumed = size;
- Py_INCREF(unicode_empty);
- return unicode_empty;
++ _Py_RETURN_UNICODE_EMPTY();
}
- /* This might be one to much, because of a BOM */
- unicode = PyUnicode_New((size+3)/4, 127);
- if (!unicode)
- return NULL;
- if (size == 0)
- return unicode;
- outpos = 0;
+#ifdef WORDS_BIGENDIAN
+ le = bo < 0;
+#else
+ le = bo <= 0;
+#endif
- while (q < e) {
- Py_UCS4 ch;
- /* remaining bytes at the end? (size should be divisible by 4) */
- if (e-q<4) {
- if (consumed)
+ _PyUnicodeWriter_Init(&writer, 0);
+ if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1)
+ goto onError;
+
+ while (1) {
+ Py_UCS4 ch = 0;
+ Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
+
+ if (e - q >= 4) {
+ enum PyUnicode_Kind kind = writer.kind;
+ void *data = writer.data;
+ const unsigned char *last = e - 4;
+ Py_ssize_t pos = writer.pos;
+ if (le) {
+ do {
+ ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
+ if (ch > maxch)
+ break;
+ PyUnicode_WRITE(kind, data, pos++, ch);
+ q += 4;
+ } while (q <= last);
+ }
+ else {
+ do {
+ ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
+ if (ch > maxch)
+ break;
+ PyUnicode_WRITE(kind, data, pos++, ch);
+ q += 4;
+ } while (q <= last);
+ }
+ writer.pos = pos;
+ }
+
+ if (ch <= maxch) {
+ if (q == e || consumed)
break;
+ /* remaining bytes at the end? (size should be divisible by 4) */
errmsg = "truncated data";
- startinpos = ((const char *)q)-starts;
- endinpos = ((const char *)e)-starts;
- goto utf32Error;
- /* The remaining input chars are ignored if the callback
- chooses to skip the input */
+ startinpos = ((const char *)q) - starts;
+ endinpos = ((const char *)e) - starts;
}
- ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
- (q[iorder[1]] << 8) | q[iorder[0]];
-
- if (ch >= 0x110000)
- {
+ else {
+ if (ch < 0x110000) {
+ if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
+ goto onError;
+ PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
+ writer.pos++;
+ q += 4;
+ continue;
+ }
errmsg = "codepoint not in range(0x110000)";
- startinpos = ((const char *)q)-starts;
- endinpos = startinpos+4;
- goto utf32Error;
+ startinpos = ((const char *)q) - starts;
+ endinpos = startinpos + 4;
}
- if (unicode_putchar(&unicode, &outpos, ch) < 0)
- goto onError;
- q += 4;
- continue;
- utf32Error:
- if (unicode_decode_call_errorhandler(
+
+ /* The remaining input chars are ignored if the callback
+ chooses to skip the input */
+ if (unicode_decode_call_errorhandler_writer(
errors, &errorHandler,
"utf32", errmsg,
&starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
if (q == e) {
if (consumed)
*consumed = size;
- Py_INCREF(unicode_empty);
- return unicode_empty;
+ _Py_RETURN_UNICODE_EMPTY();
}
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+#if PY_LITTLE_ENDIAN
native_ordering = bo <= 0;
#else
native_ordering = bo >= 0;
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
Py_ssize_t len;
- Py_ssize_t i;
len = length_of_escaped_ascii_string(s, size);
- if (len == 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
++ if (len == 0)
++ _Py_RETURN_UNICODE_EMPTY();
/* After length_of_escaped_ascii_string() there are two alternatives,
either the string is pure ASCII with named escapes like \n, etc.
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
- if (size == 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
++ if (size == 0)
++ _Py_RETURN_UNICODE_EMPTY();
+
/* Escaped strings will always be longer than the resulting
Unicode string, so we start with size here and then reduce the
length after conversion to the true value. (But decoding error
1))
return NULL;
- if (size == 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
++ if (size == 0)
++ _Py_RETURN_UNICODE_EMPTY();
+
/* XXX overflow detection missing */
- v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
- if (v == NULL)
+ _PyUnicodeWriter_Init(&writer, 0);
+ if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1)
goto onError;
- if (PyUnicode_GET_LENGTH(v) == 0)
- return v;
- outpos = 0;
end = s + size;
while (s < end) {
if (mapping == NULL)
return PyUnicode_DecodeLatin1(s, size, errors);
- if (size == 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
- v = PyUnicode_New(size, 127);
- if (v == NULL)
- goto onError;
+ if (size == 0)
- return v;
- outpos = 0;
++ _Py_RETURN_UNICODE_EMPTY();
+ _PyUnicodeWriter_Init(&writer, 0);
+ if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
+ goto onError;
+
e = s + size;
if (PyUnicode_CheckExact(mapping)) {
Py_ssize_t maplen;