granicus.if.org Git - python/commitdiff
Issue #25301: The UTF-8 decoder is now up to 15 times as fast for error
author: Victor Stinner <victor.stinner@gmail.com>
Mon, 5 Oct 2015 11:43:50 +0000 (13:43 +0200)
committer: Victor Stinner <victor.stinner@gmail.com>
Mon, 5 Oct 2015 11:43:50 +0000 (13:43 +0200)
handlers: ``ignore``, ``replace`` and ``surrogateescape``.

Doc/whatsnew/3.6.rst
Lib/test/test_codecs.py
Misc/NEWS
Objects/unicodeobject.c

index ca83ef91a0661b066bfbb99db9c00be92af28455..24fd822c7b0f46bac072540b575f31915e38cb4b 100644 (file)
@@ -123,6 +123,9 @@ Optimizations
 * The UTF-8 encoder is now up to 75 times as fast for error handlers:
   ``ignore``, ``replace``, ``surrogateescape``, ``surrogatepass``.
 
+* The UTF-8 decoder is now up to 15 times as fast for error handlers:
+  ``ignore``, ``replace`` and ``surrogateescape``.
+
 
 Build and C API Changes
 =======================
index bdc331e4911fe1095357cc0d50e180d70d3f1030..7b6883fcc51d374306e8afaad31198acefd5f851 100644 (file)
@@ -788,6 +788,18 @@ class UTF8Test(ReadTest, unittest.TestCase):
         self.check_state_handling_decode(self.encoding,
                                          u, u.encode(self.encoding))
 
+    def test_decode_error(self):
+        for data, error_handler, expected in (
+            (b'[\x80\xff]', 'ignore', '[]'),
+            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
+            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
+            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
+        ):
+            with self.subTest(data=data, error_handler=error_handler,
+                              expected=expected):
+                self.assertEqual(data.decode(self.encoding, error_handler),
+                                 expected)
+
     def test_lone_surrogates(self):
         super().test_lone_surrogates()
         # not sure if this is making sense for
index d8093771c598b19b7f6f151ddff4bde4996cc6e9..3991d6bb86af9e7567634df33f21fb34abf90f1d 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,9 @@ Release date: XXXX-XX-XX
 Core and Builtins
 -----------------
 
+- Issue #25301: The UTF-8 decoder is now up to 15 times as fast for error
+  handlers: ``ignore``, ``replace`` and ``surrogateescape``.
+
 - Issue #24848: Fixed a number of bugs in UTF-7 decoding of misformed data.
 
 - Issue #25267: The UTF-8 encoder is now up to 75 times as fast for error
index bc982876c51429303485f1faadee3af3e45ab282..56614e6b8d9bae61450024c922eacaadaa7c5fc9 100644 (file)
@@ -4714,8 +4714,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
     Py_ssize_t startinpos;
     Py_ssize_t endinpos;
     const char *errmsg = "";
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
 
     if (size == 0) {
         if (consumed)
@@ -4740,6 +4741,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
     while (s < end) {
         Py_UCS4 ch;
         int kind = writer.kind;
+
         if (kind == PyUnicode_1BYTE_KIND) {
             if (PyUnicode_IS_ASCII(writer.buffer))
                 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
@@ -4778,24 +4780,52 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
             continue;
         }
 
-        if (unicode_decode_call_errorhandler_writer(
-                errors, &errorHandler,
-                "utf-8", errmsg,
-                &starts, &end, &startinpos, &endinpos, &exc, &s,
-                &writer))
-            goto onError;
+        if (error_handler == _Py_ERROR_UNKNOWN)
+            error_handler = get_error_handler(errors);
+
+        switch (error_handler) {
+        case _Py_ERROR_IGNORE:
+            s += (endinpos - startinpos);
+            break;
+
+        case _Py_ERROR_REPLACE:
+            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
+                goto onError;
+            s += (endinpos - startinpos);
+            break;
+
+        case _Py_ERROR_SURROGATEESCAPE:
+            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
+                goto onError;
+            for (Py_ssize_t i=startinpos; i<endinpos; i++) {
+                ch = (Py_UCS4)(unsigned char)(starts[i]);
+                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
+                                ch + 0xdc00);
+                writer.pos++;
+            }
+            s += (endinpos - startinpos);
+            break;
+
+        default:
+            if (unicode_decode_call_errorhandler_writer(
+                    errors, &error_handler_obj,
+                    "utf-8", errmsg,
+                    &starts, &end, &startinpos, &endinpos, &exc, &s,
+                    &writer))
+                goto onError;
+        }
     }
 
 End:
     if (consumed)
         *consumed = s - starts;
 
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return _PyUnicodeWriter_Finish(&writer);
 
 onError:
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     _PyUnicodeWriter_Dealloc(&writer);
     return NULL;