From: Inada Naoki Date: Mon, 24 Jun 2019 03:30:24 +0000 (+0900) Subject: bpo-37348: optimize decoding ASCII string (GH-14283) X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=770847a7db33b3d4c451b42372b6942687aa6121;p=python bpo-37348: optimize decoding ASCII string (GH-14283) `_PyUnicode_Writer` is a relatively complex structure. Initializing it is significant overhead when decoding short ASCII string. --- diff --git a/Misc/NEWS.d/next/Core and Builtins/2019-06-23-00-26-30.bpo-37348.pp8P-x.rst b/Misc/NEWS.d/next/Core and Builtins/2019-06-23-00-26-30.bpo-37348.pp8P-x.rst new file mode 100644 index 0000000000..5859837d23 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2019-06-23-00-26-30.bpo-37348.pp8P-x.rst @@ -0,0 +1,2 @@ +Optimized decoding short ASCII string with UTF-8 and ascii codecs. +``b"foo".decode()`` is about 15% faster. Patch by Inada Naoki. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 4f83625905..625be4b559 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -265,6 +265,8 @@ unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value, /* Forward declaration */ static inline int _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); +static inline void +_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer); static PyObject * unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, const char *errors); @@ -4877,16 +4879,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, _Py_error_handler error_handler, const char *errors, Py_ssize_t *consumed) { - _PyUnicodeWriter writer; - const char *starts = s; - const char *end = s + size; - - Py_ssize_t startinpos; - Py_ssize_t endinpos; - const char *errmsg = ""; - PyObject *error_handler_obj = NULL; - PyObject *exc = NULL; - if (size == 0) { if (consumed) *consumed = 0; @@ -4900,13 +4892,29 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, return get_latin1_char((unsigned char)s[0]); } - _PyUnicodeWriter_Init(&writer); - writer.min_length = size; - if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) - goto onError; + const char *starts = s; + const char *end = s + size; + + // fast path: try ASCII string. + PyObject *u = PyUnicode_New(size, 127); + if (u == NULL) { + return NULL; + } + s += ascii_decode(s, end, PyUnicode_DATA(u)); + if (s == end) { + return u; + } + + // Use _PyUnicodeWriter after fast path is failed. + _PyUnicodeWriter writer; + _PyUnicodeWriter_InitWithBuffer(&writer, u); + writer.pos = s - starts; + + Py_ssize_t startinpos, endinpos; + const char *errmsg = ""; + PyObject *error_handler_obj = NULL; + PyObject *exc = NULL; - writer.pos = ascii_decode(s, end, writer.data); - s += writer.pos; while (s < end) { Py_UCS4 ch; int kind = writer.kind; @@ -6451,7 +6459,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, length after conversion to the true value. (But decoding error handler might have to resize the string) */ _PyUnicodeWriter_Init(&writer); - writer.min_length = size; + writer.min_length = size; if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) { goto onError; } @@ -6975,13 +6983,7 @@ PyUnicode_DecodeASCII(const char *s, const char *errors) { const char *starts = s; - _PyUnicodeWriter writer; - int kind; - void *data; - Py_ssize_t startinpos; - Py_ssize_t endinpos; - Py_ssize_t outpos; - const char *e; + const char *e = s + size; PyObject *error_handler_obj = NULL; PyObject *exc = NULL; _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; @@ -6993,20 +6995,25 @@ PyUnicode_DecodeASCII(const char *s, if (size == 1 && (unsigned char)s[0] < 128) return get_latin1_char((unsigned char)s[0]); - _PyUnicodeWriter_Init(&writer); - writer.min_length = size; - if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) + // Shortcut for simple case + PyObject *u = PyUnicode_New(size, 127); + if (u == NULL) { return NULL; + } + Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u)); + if (outpos == size) { + return u; + } - e = s + size; - data = writer.data; - outpos = ascii_decode(s, e, (Py_UCS1 *)data); + _PyUnicodeWriter writer; + _PyUnicodeWriter_InitWithBuffer(&writer, u); writer.pos = outpos; - if (writer.pos == size) - return _PyUnicodeWriter_Finish(&writer); - s += writer.pos; - kind = writer.kind; + s += outpos; + int kind = writer.kind; + void *data = writer.data; + Py_ssize_t startinpos, endinpos; + while (s < e) { unsigned char c = (unsigned char)*s; if (c < 128) { @@ -13506,6 +13513,16 @@ _PyUnicodeWriter_Init(_PyUnicodeWriter *writer) assert(writer->kind <= PyUnicode_1BYTE_KIND); } +// Initialize _PyUnicodeWriter with initial buffer +static inline void +_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer) +{ + memset(writer, 0, sizeof(*writer)); + writer->buffer = buffer; + _PyUnicodeWriter_Update(writer); + writer->min_length = writer->size; +} + int _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, Py_ssize_t length, Py_UCS4 maxchar)