Issue #25301: The UTF-8 decoder is now up to 15 times as fast for error handlers: ``ignore``, ``replace`` and ``surrogateescape``.

author Victor Stinner <victor.stinner@gmail.com>

Mon, 5 Oct 2015 11:43:50 +0000 (13:43 +0200)

committer Victor Stinner <victor.stinner@gmail.com>

Mon, 5 Oct 2015 11:43:50 +0000 (13:43 +0200)
author Victor Stinner <victor.stinner@gmail.com>
Mon, 5 Oct 2015 11:43:50 +0000 (13:43 +0200)
committer Victor Stinner <victor.stinner@gmail.com>
Mon, 5 Oct 2015 11:43:50 +0000 (13:43 +0200)
diff --git a/Doc/whatsnew/3.6.rst b/Doc/whatsnew/3.6.rst

index ca83ef91a0661b066bfbb99db9c00be92af28455..24fd822c7b0f46bac072540b575f31915e38cb4b 100644 (file)
--- a/Doc/whatsnew/3.6.rst
+++ b/Doc/whatsnew/3.6.rst
@@ -123,6 +123,9 @@ Optimizations
  * The UTF-8 encoder is now up to 75 times as fast for error handlers:
    ``ignore``, ``replace``, ``surrogateescape``, ``surrogatepass``.
  
+* The UTF-8 decoder is now up to 15 times as fast for error handlers:
+  ``ignore``, ``replace`` and ``surrogateescape``.
+
  
  Build and C API Changes
  =======================
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py

index bdc331e4911fe1095357cc0d50e180d70d3f1030..7b6883fcc51d374306e8afaad31198acefd5f851 100644 (file)
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -788,6 +788,18 @@ class UTF8Test(ReadTest, unittest.TestCase):
          self.check_state_handling_decode(self.encoding,
                                           u, u.encode(self.encoding))
  
+    def test_decode_error(self):
+        for data, error_handler, expected in (
+            (b'[\x80\xff]', 'ignore', '[]'),
+            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
+            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
+            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
+        ):
+            with self.subTest(data=data, error_handler=error_handler,
+                              expected=expected):
+                self.assertEqual(data.decode(self.encoding, error_handler),
+                                 expected)
+
      def test_lone_surrogates(self):
          super().test_lone_surrogates()
          # not sure if this is making sense for
diff --git a/Misc/NEWS b/Misc/NEWS

index d8093771c598b19b7f6f151ddff4bde4996cc6e9..3991d6bb86af9e7567634df33f21fb34abf90f1d 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,9 @@ Release date: XXXX-XX-XX
  Core and Builtins
  -----------------
  
+- Issue #25301: The UTF-8 decoder is now up to 15 times as fast for error
+  handlers: ``ignore``, ``replace`` and ``surrogateescape``.
+
  - Issue #24848: Fixed a number of bugs in UTF-7 decoding of misformed data.
  
  - Issue #25267: The UTF-8 encoder is now up to 75 times as fast for error
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index bc982876c51429303485f1faadee3af3e45ab282..56614e6b8d9bae61450024c922eacaadaa7c5fc9 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4714,8 +4714,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
      Py_ssize_t startinpos;
      Py_ssize_t endinpos;
      const char *errmsg = "";
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
      PyObject *exc = NULL;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
  
      if (size == 0) {
          if (consumed)
@@ -4740,6 +4741,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
      while (s < end) {
          Py_UCS4 ch;
          int kind = writer.kind;
+
          if (kind == PyUnicode_1BYTE_KIND) {
              if (PyUnicode_IS_ASCII(writer.buffer))
                  ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
@@ -4778,24 +4780,52 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
              continue;
          }
  
-        if (unicode_decode_call_errorhandler_writer(
-                errors, &errorHandler,
-                "utf-8", errmsg,
-                &starts, &end, &startinpos, &endinpos, &exc, &s,
-                &writer))
-            goto onError;
+        if (error_handler == _Py_ERROR_UNKNOWN)
+            error_handler = get_error_handler(errors);
+
+        switch (error_handler) {
+        case _Py_ERROR_IGNORE:
+            s += (endinpos - startinpos);
+            break;
+
+        case _Py_ERROR_REPLACE:
+            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
+                goto onError;
+            s += (endinpos - startinpos);
+            break;
+
+        case _Py_ERROR_SURROGATEESCAPE:
+            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
+                goto onError;
+            for (Py_ssize_t i=startinpos; i<endinpos; i++) {
+                ch = (Py_UCS4)(unsigned char)(starts[i]);
+                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
+                                ch + 0xdc00);
+                writer.pos++;
+            }
+            s += (endinpos - startinpos);
+            break;
+
+        default:
+            if (unicode_decode_call_errorhandler_writer(
+                    errors, &error_handler_obj,
+                    "utf-8", errmsg,
+                    &starts, &end, &startinpos, &endinpos, &exc, &s,
+                    &writer))
+                goto onError;
+        }
      }
  
  End:
      if (consumed)
          *consumed = s - starts;
  
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
      Py_XDECREF(exc);
      return _PyUnicodeWriter_Finish(&writer);
  
  onError:
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
      Py_XDECREF(exc);
      _PyUnicodeWriter_Dealloc(&writer);
      return NULL;
author	Victor Stinner <victor.stinner@gmail.com>
	Mon, 5 Oct 2015 11:43:50 +0000 (13:43 +0200)
committer	Victor Stinner <victor.stinner@gmail.com>
	Mon, 5 Oct 2015 11:43:50 +0000 (13:43 +0200)
Doc/whatsnew/3.6.rst		patch \| blob \| history
Lib/test/test_codecs.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history
Objects/unicodeobject.c		patch \| blob \| history